In [1]:
import datetime 
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
%matplotlib inline

sns.set_style("white")

------------ 

# Data preparation

In [2]:
# read the data
cars = pd.read_csv("autos.csv", encoding="Latin1", parse_dates = ['dateCrawled','dateCreated','lastSeen'])

In [3]:
# clean the data:
# only these values make sense for car age:
cars = cars[(cars.yearOfRegistration < 2017) & (cars.yearOfRegistration > 1900)]
# we don't want to have non-sensible prices:
cars = cars[(cars.price < 500000) & (cars.price > 500)]
# only interested in working cars for now:
cars = cars[cars.notRepairedDamage != "ja"]

In [4]:
cars = cars.assign(mileage_cat=[("low", "medium", "med-high", "high")[min(3, int(math.floor(x/50000)))] for x in cars.kilometer])

In [5]:
# age is a better feature than year of registration
# here we use the number of days since registration
cars = cars.assign(age=[datetime.timedelta(seconds=(x.dateCreated.timestamp() - 
                        (datetime.datetime.strptime(
                            str(x.yearOfRegistration) + str(x.monthOfRegistration), "%Y%M")).timestamp())).days
                        for i, x in cars.iterrows()])

In [6]:
# only use cars not registered in the future
cars = cars[cars.age > 0]

In [7]:
# save the modified csv
cars.to_csv("autos.mod.csv")

In [5]:
# to start with cleaned & modified data:
cars = pd.read_csv("autos.mod.csv")

In [4]:
cars.offerType.value_counts()

Angebot    292692
Gesuch          3
Name: offerType, dtype: int64

In [10]:
#cars.plot(x="yearOfRegistration", y="price", kind="scatter", ylim=(0, 1000000))
plt.figure()
sns.lmplot('age', 'price', data=cars, fit_reg=False, hue="brand")
plt.xlim(0, 50000)

<matplotlib.figure.Figure at 0x7f08e80700b8>

<matplotlib.figure.Figure at 0x7f08e7e9c160>

(0, 50000)

In [11]:
# most common models
cars.model.value_counts()[:20]

golf           23435
andere         20893
3er            17214
polo            9131
a4              8674
passat          8482
corsa           8364
astra           8094
c_klasse        7784
5er             7485
e_klasse        6562
a3              5604
a6              5120
transporter     4720
focus           4708
fiesta          4011
2_reihe         4004
fortwo          3847
1er             3511
a_klasse        3483
Name: model, dtype: int64

In [46]:
# get general depreciation
from sklearn import linear_model
clf = linear_model.LinearRegression()
clf.fit(cars.loc[:, ("kilometer", "yearOfRegistration")], y=cars.price)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [47]:
clf.coef_

array([ -8.10373297e-02,   2.49144470e+02])

In [None]:
# compare depreciation per model

In [13]:
cars.yearOfRegistration.hist()

<matplotlib.figure.Figure at 0x7f08f25e4cc0>

<matplotlib.axes._subplots.AxesSubplot at 0x7f08f25ef6d8>

In [14]:

sns.lmplot('yearOfRegistration', 'price', data=cars[cars.model=="golf"], fit_reg=False, hue="mileage_cat")

<matplotlib.figure.Figure at 0x7f08e79dad68>

<seaborn.axisgrid.FacetGrid at 0x7f08e79f0b00>

In [15]:

sns.lmplot('yearOfRegistration', 'price', data=cars[cars.model=="1er"], fit_reg=False, hue="mileage_cat")

<matplotlib.figure.Figure at 0x7f08ef566860>

<seaborn.axisgrid.FacetGrid at 0x7f08efbd0d68>

In [16]:
sns.lmplot('yearOfRegistration', 'price', data=cars[cars.model=="3er"], fit_reg=False, hue="mileage_cat")

<matplotlib.figure.Figure at 0x7f08ef162c50>

<seaborn.axisgrid.FacetGrid at 0x7f08ef162d68>

In [17]:
sns.countplot(x="yearOfRegistration", hue="mileage_cat", data=cars[cars.model=="3er"])

<matplotlib.figure.Figure at 0x7f08ef55d5f8>

<matplotlib.axes._subplots.AxesSubplot at 0x7f08f29f8668>

In [69]:
# write function for fit parameters for one model
# run function for all models > 100 entries
# test accuracy for each
# see how good my accuracy is, maybe also depending on input data

In [10]:
def randomize(x, y, length):
    random_indices = list(range(length))
    random.shuffle(random_indices)
    x = x[random_indices]
    y = y[random_indices]
    return x, y


def calc_cost_grad(x, y, theta):
    pass


def cost(x, y, theta, lamb=0):
    import pdb; pdb.set_trace()
    m = x.shape[0]  # number of training examples
    h = np.dot(x, theta)  # predicted y values
    theta[0] = 0  # theta_0 does not count for regularization
    j = np.sum(((h - y[:,0]) ** 2))/(2*m) + (lamb/(2*m)) * sum(theta**2)
    return j
    

def normal_equation(train_x, train_y, lamb):
    # normal equation
    reg = np.zeros((train_x.shape[1], train_x.shape[1]))
    np.fill_diagonal(reg, 1)
    reg[0, 0] = 0
    reg = lamb * reg
    theta = np.dot(np.dot(np.linalg.pinv(np.dot(train_x.T, train_x) + reg),
                          train_x.T), train_y)
    return theta


def train_parameters(train_x, train_y, cv_x, cv_y):
    # choose lambda
    lambdas = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
    train_errors, cv_errors = [], []
    train = normal_equation if train_x.shape[1] < 1000 else gradient_descent
    for lamb in lambdas:
        theta = train(train_x, train_y, lamb)
        print(theta)
        train_error = cost(train_x, train_y, theta)
        train_errors.append(train_error)
        cv_error = cost(cv_x, cv_y, theta)
        cv_errors.append(cv_error)
    print(train_errors)
    print(cv_errors)
    

def scale(x):
    mu = x.mean(axis=0)
    x = x - mu
    sigma = x.std(axis=0)
    x = x/sigma
    return x, mu, sigma


def fit_params(x, y):
    """Fit a linear regression to predict response vector y from feature matrix x.
    """
    x, y = x.as_matrix(), y.as_matrix()
    # feature scaling
    x, mu, s = scale(x)
    # first: randomize order
    len_x, len_y = x.shape[0], y.shape[0]
    x = np.insert(x, 0, 1, axis=1)
    if len_x != len_y:
        raise "Error: features x and response y have different lengths"
    x, y = randomize(x, y, len_x)
    # second: separate into training, cv, and test set
    div1, div2 = math.floor(len_x * 0.6), math.floor(len_x * 0.8)
    # train_x, train_y = x[:div1], y[:div1]
    # FIXME BAD BAD BAD
    train_x, train_y = x, y
    cv_x, cv_y = x[div1:div2], y[div1:div2]
    test_x, test_y = x[div2:], y[div2:]
    theta = train_parameters(train_x, train_y, cv_x, cv_y)


In [58]:
fit_params(cars[cars.model=="3er"].loc[:, ("kilometer", "age")], cars.price[cars.model=="3er"])

NameError: name 'simga' is not defined

In [13]:
fit_params(cars[cars.model=="3er"].loc[:, ("powerPS", "kilometer", "age")], cars.price[cars.model=="3er"])

[9098398.3775719255, 9098398.3988635354, 9098398.5687961429, 9098400.4868305102, 9098416.9718291946, 9098590.8619194534, 9099832.379682716, 9107643.3115832768, 9128716.580684524, 9171559.1681712363]
[33883438.350964755, 33883513.407945976, 33883663.297886319, 33884185.572185993, 33885658.010674044, 33890591.546619922, 33903040.524464712, 33933652.81191057, 33975171.015049942, 34027842.543601006]


In [6]:
cars.columns

Index(['Unnamed: 0', 'dateCrawled', 'name', 'seller', 'offerType', 'price',
       'abtest', 'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS',
       'model', 'kilometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen', 'mileage_cat', 'age'],
      dtype='object')

In [141]:
randrange(12)

5

In [153]:
a = cars.head().as_matrix()

In [182]:
a[0, 0] = 0

In [3]:
a = np.ones((4, 10))

In [4]:
a

array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]])

In [15]:
s = np.array([1,2,3,4,5,6,7,8,9,0])

In [16]:
a - s

array([[ 0., -1., -2., -3., -4., -5., -6., -7., -8.,  1.],
       [ 0., -1., -2., -3., -4., -5., -6., -7., -8.,  1.],
       [ 0., -1., -2., -3., -4., -5., -6., -7., -8.,  1.],
       [ 0., -1., -2., -3., -4., -5., -6., -7., -8.,  1.]])

In [90]:
# test functions using old stuff
df = pd.read_csv('/home/john/Dokumente/ml-course/ex1/ex1data2.txt', header=None)
x = df.loc[:, (0, 1)]
y = df.loc[:, (2)]
fit_params(x, y)

[ 340412.65957447  109447.79646964   -6578.35485416]
[ 340412.65957447  109444.29002461   -6576.2514397 ]
[ 340412.65957447  109437.27800558   -6572.04536699]
[ 340412.65957447  109412.74508065   -6557.33204883]
[ 340412.65957447  109342.7292621    -6515.36192204]
[ 340412.65957447  109098.58278554   -6369.2550373 ]
[ 340412.65957447  108408.70931444   -5958.46735445]
[ 340412.65957447  106080.02142222   -4594.75062444]
[ 340412.65957447  100076.79943067   -1251.04960997]
[ 340412.65957447   84366.39209005    6156.51271484]
[2043280050.6028287, 2043280054.832541, 2043280088.6638384, 2043280473.2541268, 2043283848.07846, 2043321997.973134, 2043651347.443651, 2047177580.5597289, 2073423160.6331103, 2260048691.2355566]
[63690980113.931343, 63691011814.758476, 63691075245.69796, 63691297561.221519, 63691935375.443687, 63694198124.265518, 63700917417.330719, 63727172500.195732, 63820769099.01017, 64253359197.004028]


In [11]:
# test functions using old stuff
import scipy.io as sio
d = sio.loadmat('/home/john/Dokumente/ml-course/ex5/ex5data1.mat')
X = np.insert(d["X"], 0, 1, axis=1)

cost(X, d["y"], np.array([1, 1]), 1)
#fit_params(x, y)

> <ipython-input-10-f41ec530fe7f>(15)cost()
-> m = x.shape[0]  # number of training examples
(Pdb) c


303.99319222026429

In [23]:
    reg = np.zeros((train_x.shape[1], train_x.shape[1]))
    np.fill_diagonal(reg, 1)
    reg[0, 0] = 0
    reg = lamb * reg
    theta = np.dot(np.dot(np.linalg.pinv(np.dot(train_x.T, train_x) + reg),
                          train_x.T), train_y)

Unnamed: 0,0,1
0,2104,3
1,1600,3
2,2400,3
3,1416,2
4,3000,4
5,1985,4
6,1534,3
7,1427,3
8,1380,3
9,1494,3


In [26]:
X = np.asmatrix(x)
X = np.insert(X, 0, 1, axis=1)

In [32]:
lamb = 0
reg = np.zeros((X.shape[1], X.shape[1]))
np.fill_diagonal(reg, 1)
reg[0, 0] = 0
reg = lamb * reg
np.dot(np.dot(np.linalg.pinv(np.dot(X.T, X) + reg), X.T), y)

matrix([[ 89597.90954435,    139.21067402,  -8738.01911278]])

In [60]:
train_parameters(X, y, X, y)

[[ 89597.90954435    139.21067402  -8738.01911278]]


ValueError: shapes (47,3) and (1,3) not aligned: 3 (dim 1) != 1 (dim 0)

In [56]:
X.std(axis=0)

matrix([[  0.00000000e+00,   7.86202619e+02,   7.52842809e-01]])

In [73]:
1677338815.5246389 - 1221281097.8796241

456057717.64501476

In [75]:
X / np.array([2, 100, 3])

matrix([[  0.5       ,  21.04      ,   1.        ],
        [  0.5       ,  16.        ,   1.        ],
        [  0.5       ,  24.        ,   1.        ],
        [  0.5       ,  14.16      ,   0.66666667],
        [  0.5       ,  30.        ,   1.33333333],
        [  0.5       ,  19.85      ,   1.33333333],
        [  0.5       ,  15.34      ,   1.        ],
        [  0.5       ,  14.27      ,   1.        ],
        [  0.5       ,  13.8       ,   1.        ],
        [  0.5       ,  14.94      ,   1.        ],
        [  0.5       ,  19.4       ,   1.33333333],
        [  0.5       ,  20.        ,   1.        ],
        [  0.5       ,  18.9       ,   1.        ],
        [  0.5       ,  44.78      ,   1.66666667],
        [  0.5       ,  12.68      ,   1.        ],
        [  0.5       ,  23.        ,   1.33333333],
        [  0.5       ,  13.2       ,   0.66666667],
        [  0.5       ,  12.36      ,   1.        ],
        [  0.5       ,  26.09      ,   1.33333333],
        [  0