In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

In [5]:
# Load the exam-score table from disk.
train_data = pd.read_csv('tests.csv')

# Inputs are the EXAM1..EXAM3 columns; the value to predict is the FINAL column.
X = train_data[['EXAM1', 'EXAM2', 'EXAM3']]
y = train_data['FINAL']

print(f"Shape of input data: {X.shape} and shape of target variable: {y.shape}")

# Preview the first five feature rows.
X.head()

Shape of input data: (25, 3) and shape of target variable: (25,)


Unnamed: 0,EXAM1,EXAM2,EXAM3
0,73,80,75
1,93,88,93
2,89,91,90
3,96,98,100
4,73,66,70


In [6]:
# Sanity check: (rows, columns) of the full frame, i.e. the three EXAM columns plus FINAL.
train_data.shape

(25, 4)

In [7]:
# NOTE(review): this reads the same 'tests.csv' file that was loaded as train_data,
# so test_data is not an independent hold-out set -- confirm the intended path.
test_data = pd.read_csv('tests.csv')
print(test_data.shape)
test_data.head()


(25, 4)


Unnamed: 0,EXAM1,EXAM2,EXAM3,FINAL
0,73,80,75,152
1,93,88,93,185
2,89,91,90,180
3,96,98,100,196
4,73,66,70,142


In [11]:
# Keep only the numeric (int64 / float64) columns of the test frame.
# The selection is driven by test_data itself so that test_x really is a view
# of the test set (it was previously copied from train_data by mistake).
# NOTE: FINAL is numeric too, so the target column is retained in test_x.
numeric_cols = [cname for cname in test_data.columns if test_data[cname].dtype in ['int64', 'float64']]
test_x = test_data[numeric_cols].copy()

In [12]:
print(f"Shape of input data: {test_x.shape}")

# Preview the first five rows of the numeric-only frame.
test_x.head()

Shape of input data: (25, 4)


Unnamed: 0,EXAM1,EXAM2,EXAM3,FINAL
0,73,80,75,152
1,93,88,93,185
2,89,91,90,180
3,96,98,100,196
4,73,66,70,142


In [14]:
# split into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [15]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained.
linreg = LinearRegression().fit(X_train, y_train)
linreg

LinearRegression()

In [16]:
# Score on the same rows the model was fitted on (for a regressor, score() is R^2),
# so this figure is an optimistic estimate of generalisation.
linreg.score(X_train, y_train)

0.990243642976611

In [17]:
# Report the fitted parameters: one intercept and one coefficient per input column.
for label, value in (
    ("iNTERCEPT : ", linreg.intercept_),
    ("CO-EFFICIENT : ", linreg.coef_),
):
    print(label, value)

iNTERCEPT :  -5.578811381230452
CO-EFFICIENT :  [0.31966081 0.5601746  1.20480901]


In [18]:
# Predict the target for the held-out rows produced by the train/test split.
y_pred = linreg.predict(X_test)

In [19]:
from sklearn.metrics import r2_score
from sklearn import metrics

# Compute each error metric once on the held-out predictions, then report them.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print("R-Square Value", r2_score(y_test, y_pred))
print("mean_absolute_error :", mae)
print("mean_squared_error : ", mse)
# RMSE is the square root of the MSE, which puts the error back in target units.
print("root_mean_squared_error : ", np.sqrt(mse))

R-Square Value 0.9798108497613987
mean_absolute_error : 1.777804147657014
mean_squared_error :  6.7625577639219
root_mean_squared_error :  2.6004918311584637


In [20]:
from sklearn.model_selection import cross_val_score

# 4-fold cross-validation over the whole dataset. No `scoring` is passed, so the
# estimator's own score() is used for each fold (R^2 for a regressor).
# Pass scoring="neg_mean_squared_error" instead to get (negated) MSE per fold.
cv_4_results = cross_val_score(linreg, X, y=y, cv=4)

cv_4_results

array([0.99261819, 0.99101957, 0.97337368, 0.93402313])

In [21]:
# Split the data into 5 shuffled folds.
# This 'kf' (KFold splitting strategy) object is later handed to cross_val_score().
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# split() yields (train indices, test indices) for every fold; show the sizes.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

Fold:1, Train set: 20, Test set:5
Fold:2, Train set: 20, Test set:5
Fold:3, Train set: 20, Test set:5
Fold:4, Train set: 20, Test set:5
Fold:5, Train set: 20, Test set:5


In [22]:
def rmse(score):
    """Print the RMSE that corresponds to a negated mean-squared-error score.

    Parameters
    ----------
    score : number
        A negated MSE (the sign is flipped before taking the square root).

    Returns
    -------
    None
        The value is only printed, formatted to two decimal places.
    """
    root = np.sqrt(-score)
    print(f'rmse= {root:.2f}')

### Using Linear Regression <a id ="13"></a>

In [32]:
# Cross-validate a fresh LinearRegression on the 'kf' folds, scoring each fold by
# negated mean squared error.
score = cross_val_score(
    linear_model.LinearRegression(),
    X,
    y,
    scoring="neg_mean_squared_error",
    cv=kf,
)
print('Scores for each fold: {}'.format(score))
# Convert the average (negated) MSE across folds into an RMSE and print it.
rmse(np.mean(score))

Scores for each fold: [ -4.60572355 -18.45823605 -12.77856958  -0.97162363  -5.23799328]
rmse= 2.90


In [33]:
train_data = pd.read_csv("https://raw.githubusercontent.com/SahilSinhaLpu/Machine-Learning/master/Datasets/SomvervilleHappines.csv")

# Column 'D' is the target; pull it out and keep the remaining columns as candidates.
y = train_data['D']
train_data = train_data.drop(columns=['D'])

# Keep only the int64 / float64 columns as model inputs.
numeric_cols = [
    col for col, col_dtype in train_data.dtypes.items()
    if col_dtype in ('int64', 'float64')
]
X = train_data[numeric_cols].copy()

print(f"Shape of input data: {X.shape} and shape of target variable: {y.shape}")

# Preview features and target side by side (first five rows).
pd.concat([X, y], axis=1).head()

Shape of input data: (143, 6) and shape of target variable: (143,)


Unnamed: 0,X1,X2,X3,X4,X5,X6,D
0,3,3,3,4,2,4,0
1,3,2,3,5,4,3,0
2,5,3,3,3,3,5,1
3,5,4,3,3,3,5,0
4,5,4,3,3,3,5,0


In [34]:
# split into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [35]:
from sklearn.linear_model import LogisticRegression

# Train a default LogisticRegression (fit() returns the estimator itself),
# then display its score on the very rows it was trained on.
clf = LogisticRegression().fit(X_train, y_train)
clf.score(X_train, y_train)

0.6491228070175439

In [36]:
# Score of the fitted classifier on the held-out 20% split.
clf.score(X_test,y_test)

0.4827586206896552

In [37]:
# Split the data into 5 shuffled folds.
# This 'kf' (StratifiedKFold splitting strategy) object is later handed to cross_val_score().
# Stratified folds preserve the percentage of samples of each class.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# split() yields (train indices, test indices) for every fold; show the sizes.
for cnt, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

# Note:
# for a classifier, passing a plain integer as 'cv' makes cross_val_score() use a
# StratifiedKFold splitting strategy by default (without shuffling).
# So the step above can be skipped by simply passing cv=5 to cross_val_score().

Fold:1, Train set: 114, Test set:29
Fold:2, Train set: 114, Test set:29
Fold:3, Train set: 114, Test set:29
Fold:4, Train set: 115, Test set:28
Fold:5, Train set: 115, Test set:28


In [38]:
# Cross-validated accuracy of a LogisticRegression on the stratified 'kf' folds.
score = cross_val_score(
    linear_model.LogisticRegression(random_state=42),
    X,
    y,
    scoring="accuracy",
    cv=kf,
)
print('Scores for each fold are: {}'.format(score))
print(f'Average score: {score.mean():.2f}')

Scores for each fold are: [0.62068966 0.44827586 0.65517241 0.57142857 0.67857143]
Average score: 0.59


In [39]:
algorithms = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# Compare mean cross-validated accuracy across the LogisticRegression solvers.
# max_iter is raised to 4000 so that every solver gets a chance to converge.
for algo in algorithms:
    estimator = linear_model.LogisticRegression(
        solver=algo,
        max_iter=4000,
        random_state=42,
    )
    score = cross_val_score(estimator, X, y, cv=kf, scoring="accuracy")
    print(f'Average score({algo}): {score.mean():.3f}')

Average score(newton-cg): 0.595
Average score(lbfgs): 0.595
Average score(liblinear): 0.574
Average score(sag): 0.595
Average score(saga): 0.588
