In [41]:
import numpy as np
import pandas as pd
from sklearn import ensemble, linear_model, metrics, tree
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)

In [42]:
# Load the exam-score table, then separate the predictors from the target.
train_data = pd.read_csv('tests.csv')
X = train_data.loc[:, ['EXAM1', 'EXAM2', 'EXAM3']]  # predictor columns
y = train_data['FINAL']  # target variable
X.head()

Unnamed: 0,EXAM1,EXAM2,EXAM3
0,73,80,75
1,93,88,93
2,89,91,90
3,96,98,100
4,73,66,70


In [43]:
# Dimensions of the loaded table as (rows, columns).
train_data.shape

(25, 4)

In [44]:
# NOTE(review): this reads the same 'tests.csv' file that train_data was loaded
# from, so test_data holds the same rows as the training frame — confirm whether
# a separate test file was intended.
test_data = pd.read_csv('tests.csv')
# Show the dimensions first, then preview the leading rows as the cell output.
print(test_data.shape)
test_data.head()

(25, 4)


Unnamed: 0,EXAM1,EXAM2,EXAM3,FINAL
0,73,80,75,152
1,93,88,93,185
2,89,91,90,180
3,96,98,100,196
4,73,66,70,142


In [45]:
# Select numeric columns only. The column list is derived from the training
# frame; it keeps every int64/float64 column, which includes the FINAL target.
numeric_cols = [cname for cname in train_data.columns if train_data[cname].dtype in ['int64', 'float64']]
# Build the test feature frame from test_data. It was previously sliced from
# train_data, so `test_x` never reflected the separately loaded test set.
test_x = test_data[numeric_cols].copy()

In [46]:
# Preview the first rows of the numeric-only frame.
test_x.head()

Unnamed: 0,EXAM1,EXAM2,EXAM3,FINAL
0,73,80,75,152
1,93,88,93,185
2,89,91,90,180
3,96,98,100,196
4,73,66,70,142


In [47]:
# Split into training and testing sets: 20% of the rows are held out, and
# random_state pins the shuffle so the split is reproducible.
# train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [48]:
# Fit a linear regression on the training split.
# LinearRegression is imported in the notebook's top import cell.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression()

In [49]:
# Score of the fitted model on the training split (R^2 for scikit-learn regressors).
linreg.score(X_train, y_train)

0.990243642976611

In [50]:
# Report the fitted intercept and the per-feature coefficients.
# `!s` forces str() conversion, matching what print() does with a bare argument.
print(f"iNTERCEPT :  {linreg.intercept_!s}")
print(f"CO-EFFICIENT :  {linreg.coef_!s}")

iNTERCEPT :  -5.578811381230452
CO-EFFICIENT :  [0.31966081 0.5601746  1.20480901]


In [51]:
# Predict the target for the held-out rows.
y_pred = linreg.predict(X_test)

In [52]:
# Evaluate the predictions on the held-out split.
# r2_score and metrics are imported in the notebook's top import cell.
mse = metrics.mean_squared_error(y_test, y_pred)  # computed once, reused for the RMSE
print("R-Square Value",r2_score(y_test,y_pred))
print ("mean_absolute_error :",metrics.mean_absolute_error(y_test, y_pred))
print ("mean_squared_error : ",mse)
print ("root_mean_squared_error : ",np.sqrt(mse))

R-Square Value 0.9798108497613987
mean_absolute_error : 1.777804147657014
mean_squared_error :  6.7625577639219
root_mean_squared_error :  2.6004918311584637


In [53]:
# 4-fold cross-validation of the linear model on the full exam data.
# cross_val_score is already imported in the top import cell, so the
# redundant re-import that used to live here has been dropped.
cv_4_results = cross_val_score(linreg, X, y, cv=4)
cv_4_results

array([0.99261819, 0.99101957, 0.97337368, 0.93402313])

In [54]:
# k = 5 folds, shuffled with a fixed seed so the partition is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# enumerate(start=1) numbers the folds, replacing the hand-maintained counter.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

Fold:1, Train set: 20, Test set:5
Fold:2, Train set: 20, Test set:5
Fold:3, Train set: 20, Test set:5
Fold:4, Train set: 20, Test set:5
Fold:5, Train set: 20, Test set:5


In [55]:
def rmse(score):
    """Print the root-mean-squared error implied by a negated MSE score.

    Parameters
    ----------
    score : number
        A negated mean-squared-error value; its sign is flipped before the
        square root is taken.

    Returns
    -------
    None
        The result is only printed, as ``rmse= <value>`` with two decimals.
    """
    root = np.sqrt(-score)
    print(f'rmse= {root:.2f}')

### Using Linear Regression, then Logistic Regression <a id="13"></a>

In [56]:
# Cross-validate a fresh linear regression over the folds in `kf`.
# The scorer reports negated MSE per fold.
score = cross_val_score(
    linear_model.LinearRegression(),
    X,
    y,
    cv=kf,
    scoring="neg_mean_squared_error",
)
print(f'Scores for each fold: {score}')
# rmse() flips the sign of the mean before taking the square root.
rmse(np.mean(score))

Scores for each fold: [ -4.60572355 -18.45823605 -12.77856958  -0.97162363  -5.23799328]
rmse= 2.90


In [58]:
# Load the second dataset from the URL; column D is used as the label.
train_data = pd.read_csv("https://raw.githubusercontent.com/SahilSinhaLpu/Machine-Learning/master/Datasets/SomvervilleHappines.csv")
y = train_data['D']
# Remove the label so only candidate feature columns remain in train_data.
train_data = train_data.drop(columns=['D'])
# Keep the int64/float64 columns as model inputs.
numeric_cols = [cname for cname, dtype in train_data.dtypes.items() if dtype in ('int64', 'float64')]
X = train_data[numeric_cols].copy()
# Preview features and label side by side.
pd.concat([X, y], axis=1).head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,D
0,3,3,3,4,2,4,0
1,3,2,3,5,4,3,0
2,5,3,3,3,3,5,1
3,5,4,3,3,3,5,0
4,5,4,3,3,3,5,0


In [59]:
# Split into training and testing sets (20% held out, fixed seed).
# This rebinds X_train / X_test / y_train / y_test to the new dataset.
# train_test_split comes from the top import cell; the duplicate import
# that used to live here has been dropped.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [60]:
# Fit a logistic-regression classifier with default settings on the training split.
# LogisticRegression is imported in the notebook's top import cell.
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Score on the training split itself (mean accuracy for scikit-learn classifiers).
clf.score(X_train, y_train)

0.6491228070175439

In [61]:
# Score on the held-out split (mean accuracy for scikit-learn classifiers).
clf.score(X_test,y_test)

0.4827586206896552

In [62]:
# k = 5 stratified folds, shuffled with a fixed seed; this rebinds `kf`.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# enumerate(start=1) numbers the folds, replacing the hand-maintained counter.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

Fold:1, Train set: 114, Test set:29
Fold:2, Train set: 114, Test set:29
Fold:3, Train set: 114, Test set:29
Fold:4, Train set: 115, Test set:28
Fold:5, Train set: 115, Test set:28


In [63]:
# Cross-validated accuracy of a default logistic regression over the folds in `kf`.
score = cross_val_score(
    linear_model.LogisticRegression(random_state=42),
    X,
    y,
    cv=kf,
    scoring="accuracy",
)
print(f'Scores for each fold are: {score}')
print(f'Average score: {score.mean():.2f}')

Scores for each fold are: [0.62068966 0.44827586 0.65517241 0.57142857 0.67857143]
Average score: 0.59


In [64]:
# Compare mean cross-validated accuracy across the listed solvers.
algorithms = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

for algo in algorithms:
    # Same folds (`kf`) and seed for every solver; only `solver` varies.
    score = cross_val_score(
        linear_model.LogisticRegression(max_iter=10000, solver=algo, random_state=42),
        X,
        y,
        cv=kf,
        scoring="accuracy",
    )
    print(f'Average score({algo}): {score.mean():.3f}')

Average score(newton-cg): 0.595
Average score(lbfgs): 0.595
Average score(liblinear): 0.574
Average score(sag): 0.595
Average score(saga): 0.588
