In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn import ensemble , tree , linear_model
from sklearn.metrics import r2_score ,  mean_squared_error

In [2]:

x = pd.read_csv("../tsrl10000.csv")
x = x.iloc[:,1:]

y = pd.read_csv("../srlbininfo_10000",sep ="\t",header = None)
y = pd.DataFrame(y.iloc[:,1])
y.columns = ["RL"]

df = pd.concat([y,x],axis =1)

df = df[df.RL < 25] #Ignore values more than 25

y = pd.DataFrame(df.iloc[:,0])
x = df.iloc[:,1:]
x.shape, y.shape , df.shape

((9977, 3880), (9977, 1), (9977, 3881))

In [49]:
x.shape, y.shape , df.shape

((9977, 3880), (9977, 1), (9977, 3881))

## Split the data to training and Testing Set

In [3]:
x_train, x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2) # 80 Training 20 test
x_train.shape,y_train.shape, x_test.shape , y_test.shape

((7981, 3880), (7981, 1), (1996, 3880), (1996, 1))

# Using Multiple Linear Regression

In [52]:
lr = linear_model.LinearRegression().fit(x_train,y_train)
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [53]:
print(lr.coef_)
print(lr.intercept_)

[[0.23943922 1.14416399 0.1957286  ... 0.         0.         0.        ]]
[-2223.6613302]


In [55]:
y_pred = lr.predict(x_test)

In [56]:
print("R2 Score for train set :",lr.score(x_train,y_train))
print("R2 score for test Set",lr.score(x_test,y_test))
print("R2 score for test Set",lr.score(x_test,y_pred))

R2 Score for train set : 0.8787929449172964
R2 score for test Set 0.6787285010424569
R2 score for test Set 1.0


In [57]:
y_pred[:10]

array([[ 8.65688538],
       [15.25360688],
       [10.68598668],
       [ 8.93883004],
       [ 6.64262769],
       [ 8.75296059],
       [ 2.58275619],
       [ 6.82335782],
       [11.88437117],
       [ 5.84134426]])

In [58]:
y_test[:10]

Unnamed: 0,RL
1349,9.187994
6586,18.074515
7828,8.79
3030,7.903805
2572,7.898488
944,10.56785
4110,5.851232
8550,4.8
9906,13.457643
2706,6.44


### Accuracy Only 64.8 % with Linear Regression

In [36]:
#cv_linear = cross_val_score(estimator = lr, X= x_train , y = y_train , cv =10)

#Predicting R2  Score the Train set results
y_pred_linear_train = lr.predict(x_train)
r2_score_linear_train = r2_score(y_train , y_pred_linear_train)

#Predicting R2Scorethe Test set results
y_pred_linear_test = lr.predict(x_test)
r2_score_linear_test = r2_score(y_test, y_pred_linear_test)

#Predicting RMSE the Test set results_test
rmse_linear = (np.sqrt(mean_squared_error(y_test , y_pred_linear_test)))

#print("CV --> ",cv_linear.mean())
print("R2 Score (train)" , r2_score_linear_train)
print("R2 Score test", r2_score_linear_test)
print("RMSE",rmse_linear)

R2 Score (train) 0.8659768694813629
R2 Score test 0.6482766211775475
RMSE 2.484333784692709


### Polynomial  Regression -2nd Degree

In [77]:
from sklearn.preprocessing import PolynomialFeatures

#poly_reg = PolynomialFeatures(degree =2)
#x_poly = poly_reg.fit_transform(x_train)
#poly_reg.fit(x_poly, y_train)
#regressor_poly2 = LinearRegression()
#regressor_poly2.fit(x_poly,y_train)

## Ridge Regression

In [80]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

steps = [
    ("scalar", StandardScaler()),
    ("poly",PolynomialFeatures(degree =2)),
    ("model" , Ridge(alpha = 0.1 , fit_intercept=True))
]
ridge_pipe = Pipeline(steps)
#ridge_pipe.fit(x_train, y_train)

In [82]:
#ridge_pipe.fit(x_train, y_train)

## Lasso also Cause Memory Error

In [84]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

steps = [
    ("scalar", StandardScaler()),
    ("poly",PolynomialFeatures(degree =2)),
    ("model",Lasso(alpha = 0.012, fit_intercept=True , max_iter=3000))
]

lasso_pipe = Pipeline(steps)
#lasso_pipe.fit(x_train, y_train)

## Suport Vector  Regression 

In [87]:
import numpy as np

In [91]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()

x_scaled = sc_x.fit_transform(x_train)
y_scaled = sc_y.fit_transform(y_train.values.reshape(-1,1))

#### Fitting the SVR model to the data set

In [93]:
from sklearn.svm import SVR
regressor_svr = SVR(kernel = "rbf", gamma = "scale")
regressor_svr.fit(x_scaled, y_scaled.ravel())

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [94]:
from sklearn.metrics import r2_score

#Predicting Cross Validation Score 
cv_svr = cross_val_score(estimator=regressor_svr , X= x_scaled , y = y_scaled.ravel(), cv = 10)

#Predicting R2 Scorethe Train
y_pred_svr_train = sc_y.inverse_transform(regressor_svr.predict(sc_x.transform(x_test)))
r2_score_svr_train = r2_score(y_train, y_pred_svr_train)

#Predicting R2 Scoret Test set
y_pred_svr_test = sc_y.inverse_transform(regressor_svr.predict(sc_x.transform(x_test)))
r2_score_svr_test = r2_score(y_test ,y_pred_svr_test)

# Predicting RMSE the Test set Results
rmse_svr = (np.sqrt(mean_squared_error(y_test, y_pred_svr_test)))

print("CV",cv_svr.mean())
print("R2_Score (train)",r2_score_svr_train)
print("R2 Score test",r2_score_svr_test)

print("RMSE",rmse_svr)

ValueError: Found input variables with inconsistent numbers of samples: [7981, 1996]

### Decision Tree Regression

In [98]:
#Fitting the Decision Treee Regression Model to the data set
from sklearn.tree import DecisionTreeRegressor
regressor_dt = DecisionTreeRegressor(random_state = 0)
regressor_dt.fit(x_train , y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [59]:
from sklearn.metrics import r2_score

#Predicting Cross Validation Score
cv_dt = cross_val_score(estimator = regressor_dt , x = x_train , y = y_train , cv =10)

#Predicting R2 Score the Train set results
y_pred_dt_train = regressor_dt.predict(x_train)
r2_score_dt_train = r2_score(y_)

NameError: name 'regressor_dt' is not defined