# Modeling

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV, LogisticRegression
import datetime as dt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC

In [19]:
train = pd.read_csv('./train_01.csv')
test = pd.read_csv('./test_01.csv')

print(train.shape)
print(test.shape)

(211041, 152)
(62096, 152)


In [20]:
train['date_account_created'] = pd.to_datetime(train['date_account_created'])
test['date_account_created'] = pd.to_datetime(test['date_account_created'])

train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active'])
test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active'])


In [21]:
train['date_account_created'] = train['date_account_created'].map(dt.datetime.toordinal)
test['date_account_created'] = test['date_account_created'].map(dt.datetime.toordinal)

train['timestamp_first_active'] = train['timestamp_first_active'].map(dt.datetime.toordinal)
test['timestamp_first_active'] = test['timestamp_first_active'].map(dt.datetime.toordinal)


In [24]:
# include all features other than target var and features that could not possible create a better model (PID, Id)
features = train.drop(columns=['id', 'country_destination']).columns

X = train[features]

y = train['country_destination']


X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42)

In [32]:
ss = StandardScaler()
ss.fit(X_train)
Z_train = ss.transform(X_train)
Z_test = ss.transform(X_test)

In [43]:
# C values to GridSearch over
# DSI-US-12/6.05-lesson-support-vector-machines/blob/master/solution-code/solution-code.ipynb
pgrid = {"C": [0.01, 0.1]}

svc = LinearSVC()
gcv = GridSearchCV(svc, pgrid, cv=3, n_jobs=-1, verbose=3)
gcv.fit(Z_train, y_train);

gcv.best_score_

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed: 14.5min remaining: 28.9min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 15.6min finished


0.8757771038665657

In [34]:
logreg = LogisticRegression(max_iter=1_000)

model = logreg.fit(Z_train, y_train)

print(model.score(Z_train, y_train))
print(model.score(Z_test, y_test))

0.8758718726307809
0.8774284035556567


In [35]:
# select corresponding features from test set, standardize values accoring to the fit on the train set
Z_test = ss.transform(test[features])
# predict sales prices, convert values to original scale from log scale
test['SalePrice'] = np.exp(model.predict(Z_test))
# eyeball predictions
test['SalePrice']

0      147685.413289
1      157253.396984
2      217337.353079
3      100089.546202
4      172686.922585
           ...      
873    187366.221970
874    219049.335917
875    126001.656625
876    117441.985082
877    128620.237424
Name: SalePrice, Length: 878, dtype: float64

In [36]:
# save submission csv
test[['Id', 'SalePrice']].to_csv(f'./output/sub48', index=False)
