# 3. Problem 1: Scratching train_test_split

In [1]:
def scratch_train_test_split(X,y,train_size=0.8,random_state=0):
    
    np.random.seed(random_state)
    y = y.reshape(-1,1)
    Xy = np.concatenate([X,y],axis=1)
    size = len(Xy)
    pick = int(np.round(size*train_size))
    train_pick = np.random.choice(np.arange(size),pick,replace=False)
    test_pick = np.delete(np.arange(size),train_pick)
    train = Xy[train_pick,:]
    test = Xy[test_pick,:]
    X_train = train[:,0:(Xy.shape[1]-y.shape[1])].reshape(-1,X.shape[1])
    y_train = train[:,-y.shape[1]].reshape(-1,)
    X_test = test[:,0:(Xy.shape[1]-y.shape[1])].reshape(-1,X.shape[1])
    y_test = test[:,-y.shape[1]].reshape(-1,)
    
    return X_train, X_test, y_train, y_test

In [2]:
import numpy as np
from sklearn.datasets import load_iris

data = load_iris()
X = data.data
y = data.target


In [3]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y)
print(f'X', X.shape)
print(f'y', y.shape)
print(f'X_train', X_train.shape)
print(f'X_test', X_test.shape)
print(f'y_train', y_train.shape)
print(f'y_test', y_test.shape)

X (150, 4)
y (150,)
X_train (120, 4)
X_test (30, 4)
y_train (120,)
y_test (30,)


In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [5]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="log_loss")
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [6]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred,  average="weighted")
f1 = f1_score(y_test, y_pred,  average="weighted")

In [7]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F score:", f1)

Accuracy: 0.7666666666666667
Precision: 0.86875
Recall: 0.7666666666666667
F score: 0.7448888888888889


In [8]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [9]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred,  average="weighted")
f1 = f1_score(y_test, y_pred,  average="weighted")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F score:", f1)

Accuracy: 0.9333333333333333
Precision: 0.9454545454545454
Recall: 0.9333333333333333
F score: 0.9333333333333333


In [10]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [11]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred,  average="weighted")
f1 = f1_score(y_test, y_pred,  average="weighted")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F score:", f1)

Accuracy: 0.9333333333333333
Precision: 0.9454545454545454
Recall: 0.9333333333333333
F score: 0.9333333333333333


# Home price forecasting

In [13]:
import pandas as pd

train = pd.read_csv('../data/house_price/train.csv')
X = train[['GrLivArea','YearBuilt']].values
y = train[['SalePrice']].values
X_train, X_test, y_train, y_test = scratch_train_test_split(X,y,train_size=0.8,random_state=0)

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [16]:
from sklearn.linear_model import SGDRegressor
reg = SGDRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test_std)

In [17]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test,y_pred)

In [18]:
print(rmse)
print(r2)

1652069799213.9841
-554134274318562.4
