In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.linear_model import LinearRegression
from datetime import datetime
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the full data set; the path is relative to the notebook's working directory.
TRAINING_CSV = "WasLetztePreis/WasLetztePreis_training.csv"
df = pd.read_csv(TRAINING_CSV)

In [3]:
# Display the loaded frame to inspect its columns and a sample of its rows.
df

Unnamed: 0,id,preis,marke,model,bauform,kraftstoff,getriebe,ps,kilometerstand,erstzulassung,gebiet,verkaufsdatum
0,100000,1000.0,audi,80,limousine,benzin,manuell,90,150000,1994,25853,2016-03-20
1,100001,23750.0,volkswagen,golf,limousine,benzin,automatik,230,60000,2013,31812,2016-03-28
2,100002,6750.0,bmw,3er,coupe,benzin,automatik,192,100000,2001,24398,2016-04-07
3,100003,9750.0,volkswagen,kaefer,limousine,,,0,40000,1985,48163,2016-03-21
4,100004,19999.0,mercedes_benz,c_klasse,cabrio,benzin,manuell,156,40000,2011,74078,2016-03-17
...,...,...,...,...,...,...,...,...,...,...,...,...
249995,349995,,opel,corsa,kleinwagen,benzin,manuell,75,150000,2002,48429,2016-03-21
249996,349996,,honda,jazz,kleinwagen,benzin,manuell,99,50000,2012,22880,2016-03-28
249997,349997,,volkswagen,golf,kleinwagen,benzin,manuell,75,150000,1998,79713,2016-04-04
249998,349998,,audi,a3,limousine,diesel,manuell,140,125000,2007,83730,2016-04-07


In [4]:
# Rows with a known price are the labeled examples used for fitting.
has_price = df["preis"].notnull()
traindata = df.loc[has_price]

In [9]:
# One-hot encode each categorical column; the prefix keeps the origin of every
# indicator column recognizable (e.g. "marke_audi", "getriebe_manuell").
marken_dummies = pd.get_dummies(traindata["marke"], prefix="marke")
bauform_dummies = pd.get_dummies(traindata["bauform"], prefix="bauform")
kraftstoff_dummies = pd.get_dummies(traindata["kraftstoff"], prefix="kraftstoff")
getriebe_dummies = pd.get_dummies(traindata["getriebe"], prefix="getriebe")

In [13]:
# Assemble the training feature matrix: raw numeric columns first, followed by
# the one-hot blocks in a fixed order.
# NOTE(review): "gebiet" is passed through as a plain number although its values
# look like area/postal codes — confirm that treating it as numeric is intended.
numeric_features = traindata[["ps", "kilometerstand", "erstzulassung", "gebiet"]]
train_parts = [
    numeric_features,
    marken_dummies,
    bauform_dummies,
    kraftstoff_dummies,
    getriebe_dummies,
]
train2 = pd.concat(train_parts, axis="columns")

In [19]:

def datum_to_int(datum):
    """Convert a ``YYYY-MM-DD`` date string into an integer day number.

    This helper was called by the notebook but not defined in any visible
    cell, so running the notebook on a fresh kernel raised ``NameError``.

    NOTE(review): the original encoding is not recoverable from the notebook;
    the proleptic Gregorian ordinal (days since 0001-01-01) is used here so
    that differences between values are day counts — confirm this matches
    the intended feature.

    Parameters
    ----------
    datum : str
        Date formatted like ``"2016-03-20"``.

    Returns
    -------
    int
        Ordinal day number of the date.

    Raises
    ------
    ValueError
        If ``datum`` does not match the ``%Y-%m-%d`` format.
    """
    return datetime.strptime(datum, "%Y-%m-%d").toordinal()


# Add the sale date as a numeric column. Missing dates are passed through as NaN
# instead of crashing the conversion.
# NOTE(review): train2 is built without this column, so it does not reach the model.
traindata = traindata.assign(
    verkaufsdatum_int=traindata["verkaufsdatum"].map(datum_to_int, na_action="ignore")
)

In [14]:
# List the feature columns of the assembled training matrix.
train2.columns

Index(['ps', 'kilometerstand', 'erstzulassung', 'gebiet', 'marke_alfa_romeo',
       'marke_audi', 'marke_bmw', 'marke_chevrolet', 'marke_chrysler',
       'marke_citroen', 'marke_dacia', 'marke_daewoo', 'marke_daihatsu',
       'marke_fiat', 'marke_ford', 'marke_honda', 'marke_hyundai',
       'marke_jaguar', 'marke_jeep', 'marke_kia', 'marke_lada', 'marke_lancia',
       'marke_land_rover', 'marke_mazda', 'marke_mercedes_benz', 'marke_mini',
       'marke_mitsubishi', 'marke_nissan', 'marke_opel', 'marke_peugeot',
       'marke_porsche', 'marke_renault', 'marke_rover', 'marke_saab',
       'marke_seat', 'marke_skoda', 'marke_smart', 'marke_sonstige_autos',
       'marke_subaru', 'marke_suzuki', 'marke_toyota', 'marke_trabant',
       'marke_volkswagen', 'marke_volvo', 'bauform_cabrio', 'bauform_coupe',
       'bauform_kleinwagen', 'bauform_kombi', 'bauform_limousine',
       'bauform_suv', 'kraftstoff_andere', 'kraftstoff_benzin',
       'kraftstoff_cng', 'kraftstoff_diesel', 'krafts

In [42]:
# Support-vector regressor behind a standardization step.
# NOTE(review): no visible cell fits or predicts with `regr`; the predictions
# written out below come from the LinearRegression model `lr`.
svr = SVR(C=1.0, epsilon=0.2)
regr = make_pipeline(StandardScaler(), svr)

In [15]:
# Features and target for the fit: the assembled matrix as a NumPy array and
# the known prices of the labeled rows.
X_train = train2.values
Y_train = traindata["preis"]

# Ordinary least-squares baseline; fit() returns the estimator, which the cell displays.
lr = LinearRegression()
lr.fit(X_train, Y_train)

LinearRegression()

In [17]:
# Rows without a price are the ones the model has to predict.
testdata = df.loc[df["preis"].isnull()]

# One-hot encode the categorical columns of the unlabeled rows.
marken_dummies1 = pd.get_dummies(testdata["marke"]).add_prefix("marke_")
bauform_dummies1 = pd.get_dummies(testdata["bauform"]).add_prefix("bauform_")
kraftstoff_dummies1 = pd.get_dummies(testdata["kraftstoff"]).add_prefix("kraftstoff_")
getriebe_dummies1 = pd.get_dummies(testdata["getriebe"]).add_prefix("getriebe_")

test2 = pd.concat([testdata[["ps", "kilometerstand", "erstzulassung", "gebiet"]], marken_dummies1, bauform_dummies1, kraftstoff_dummies1, getriebe_dummies1], axis=1)

# The dummies above are derived from the test rows alone, so a category that is
# absent (or extra) in this subset would change the number or order of columns
# relative to the matrix the model was fitted on. Because the model receives a
# bare NumPy array, such a shift would either fail on a width mismatch or
# silently apply coefficients to the wrong features. Force the exact training
# layout: missing indicator columns become all-zero, unseen ones are dropped.
test2 = test2.reindex(columns=train2.columns, fill_value=0)

In [18]:

# Predict prices for the unlabeled rows with the fitted linear model.
X_test = test2.values
Y = lr.predict(X_test)

In [19]:
# Pair every unlabeled row's id with its predicted price.
solution_columns = {"id": testdata["id"], "preis": Y}
solution = pd.DataFrame(solution_columns)

In [20]:
# Raw (id, preis) rows as a NumPy array for writing the submission file.
svs = solution.values

# Spot-check the third row.
svs[2]

array([300002.        ,   1794.40496864])

In [21]:
# Write the submission file: two header lines, then one "<id> <preis>" line per
# prediction, both values truncated to integers.
# NOTE(review): the first header line looks like a hardcoded token/hash —
# confirm it is safe to keep in a shared notebook.
with open("letztepreis_solution1.txt", "w") as f:
    for header in (
        "50fd85bdaa1c8d7b910d4602121085fa2e62616635e7d9593ed80d721db68b90\n",
        "wlp-test2\n",
    ):
        f.write(header)
    prediction_lines = [f"{int(row[0])} {int(row[1])}" for row in svs]
    f.write("\n".join(prediction_lines) + "\n")