In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
df = pd.read_csv("Desktop/Data/Boston.csv")
df.head()

In [None]:
df.describe()

In [None]:
 df.info()

In [None]:
rows = 2
cols = 7

fig, ax = plt.subplots(nrows = rows, ncols = cols, figsize = (16,4))

# One distribution plot per column, filling the grid in row-major order.
# Pairing axes with columns through zip stops at whichever runs out first, so
# a frame with fewer than rows * cols columns no longer raises IndexError
# (surplus axes are simply left empty) and no manual counter is needed.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; switch
# to sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
for axis, column in zip(ax.flat, df.columns):
    sns.distplot(df[column], ax = axis)
plt.tight_layout()

In [None]:
# Pairwise correlation matrix of the columns of df; later cells read its
# 'medv' column to pick features. Shown as the cell output.
corrmat = df.corr()
corrmat

In [None]:
def getCorrelatedFeature(corrdata, threshold):
    """Select the entries of a correlation Series whose magnitude exceeds a cutoff.

    Parameters
    ----------
    corrdata : pandas.Series
        Correlation values, looked up by index label.
    threshold : float
        Strict lower bound applied to the absolute correlation value.

    Returns
    -------
    pandas.DataFrame
        A single "Corr Value" column indexed by the labels that passed the
        filter, in their original order.
    """
    # Walk the labels once, keeping (label, value) pairs that clear the cutoff.
    selected = [
        (label, corrdata[label])
        for label in corrdata.index
        if abs(corrdata[label]) > threshold
    ]
    labels = [label for label, _ in selected]
    values = [corr for _, corr in selected]
    return pd.DataFrame(data = values, index = labels, columns=["Corr Value"])

In [None]:
# Keep the columns whose absolute correlation with 'medv' is above the cutoff.
# 'medv' itself stays in the selection and is split off as the target below.
threshold = 0.50
corr_value = getCorrelatedFeature(corrmat['medv'], threshold)

In [None]:
# Names of the selected columns.
corr_value.index.values

In [None]:
# Restrict the dataset to the selected columns.
correlated_data = df[corr_value.index]

In [None]:
correlated_data.head()

In [None]:
# Pairwise scatter plots of the selected columns.
sns.pairplot(correlated_data)
plt.tight_layout()

In [None]:
# Annotated heatmap of the correlations among the selected columns.
sns.heatmap(correlated_data.corr(), annot = True, annot_kws = {'size':12})

In [None]:
# Features are every selected column except the target 'medv'.
X = correlated_data.drop(labels=['medv'], axis=1)
y = correlated_data['medv']
X.head()

In [None]:
# 80/20 split with a fixed seed. Later cells pass this y_test to
# performance_metrics for every subsequent run as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 

In [None]:
X_train.shape, X_test.shape

In [None]:
# Fit an ordinary linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Predictions for the held-out rows.
y_predict = model.predict(X_test)

In [None]:
# Side-by-side view of the predictions and the held-out true values, paired by
# position. It is stored under its own name so the raw dataset held in `df` is
# not overwritten, and the two columns are labelled instead of numbered.
prediction_vs_actual = pd.DataFrame({'predicted': y_predict, 'actual': y_test.values})
prediction_vs_actual

In [None]:
from sklearn.metrics import r2_score

In [None]:
correlated_data.columns

In [None]:
score = r2_score(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)

print('r2_score:' , score)
print('mae:', mae)
print('mse:', mse)

In [None]:
total_features = []
total_features_name = []
selected_correlation_value = []
r2_scores = []
mae_value = []
mse_value = []

In [None]:
def performance_metrics(features, th, y_true, y_pred):
    """Score one run and append it to the notebook-level result lists.

    Parameters
    ----------
    features : sequence
        Column names used for the run. The recorded feature count is
        ``len(features) - 1``; callers in this notebook pass the target
        column along with the features.
    th : float
        Correlation threshold recorded alongside the run.
    y_true, y_pred : array-like
        Observed and predicted target values handed to the metric functions.

    Returns
    -------
    pandas.DataFrame
        One row per call made so far (the module-level lists persist between
        calls), with columns 'features name', 'feature', 'corr_value',
        'r2_score', 'MAE' and 'MSE'.
    """
    # Compute all metrics before touching the shared lists.
    r2 = r2_score(y_true, y_pred)
    abs_err = mean_absolute_error(y_true, y_pred)
    sq_err = mean_squared_error(y_true, y_pred)

    additions = (
        (total_features, len(features) - 1),
        (total_features_name, str(features)),
        (selected_correlation_value, th),
        (r2_scores, r2),
        (mae_value, abs_err),
        (mse_value, sq_err),
    )
    for history, entry in additions:
        history.append(entry)

    # Rows are built per metric and then transposed so each run becomes a row.
    row_labels = ['features name', 'feature', 'corr_value', 'r2_score', 'MAE', 'MSE']
    per_metric_rows = [total_features_name, total_features,
                       selected_correlation_value, r2_scores,
                       mae_value, mse_value]
    summary = pd.DataFrame(data = per_metric_rows, index = row_labels)
    return summary.T

In [None]:
# Record the baseline run in the accumulator lists and display the metrics table so far.
performance_metrics(correlated_data.columns.values, threshold, y_test, y_predict)

In [None]:
rows = 2
cols = 2

fig, ax = plt.subplots(nrows = rows, ncols = cols, figsize = (16,4))

# Regress 'medv' on each selected column, one panel per column in row-major
# order. zip stops at whichever of axes/columns runs out first, so a selection
# with fewer than rows * cols columns no longer raises IndexError.
for axis, column in zip(ax.flat, correlated_data.columns):
    sns.regplot(x = correlated_data[column], y = correlated_data['medv'], ax = axis)
fig.tight_layout()

In [None]:
# Correlation of every column with the target, for choosing the next cutoff.
corrmat['medv']

In [None]:
# Re-read the raw dataset so `df` holds the full table before re-selecting columns.
df = pd.read_csv("Desktop/Data/Boston.csv")

In [None]:
# Stricter cutoff than the baseline run.
threshold = 0.60
corr_value = getCorrelatedFeature(corrmat['medv'], threshold)

In [None]:
corr_value.index.values

In [None]:
correlated_data = df[corr_value.index]

In [None]:
correlated_data.head()

In [None]:
def get_y_predict(corr_data, target = 'medv', test_size = 0.2, random_state = 0):
    """Fit a linear regression on a train split and predict the held-out rows.

    Parameters
    ----------
    corr_data : pandas.DataFrame
        Feature columns plus the target column.
    target : str, default 'medv'
        Column to predict; every other column is used as a feature.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 0
        Forwarded to ``train_test_split``.

    Returns
    -------
    array-like
        Predictions for the test split. The matching true values are NOT
        returned: callers in this notebook compare against the module-level
        ``y_test`` from the earlier top-level split, which only lines up while
        the defaults here match that split's ``test_size`` and
        ``random_state`` and the data has the same rows.
    """
    features = corr_data.drop(labels = [target], axis = 1)
    labels = corr_data[target]
    features_train, features_test, labels_train, _ = train_test_split(
        features, labels, test_size = test_size, random_state = random_state)
    regressor = LinearRegression()
    regressor.fit(features_train, labels_train)
    return regressor.predict(features_test)

In [None]:
# Refit on the threshold-0.60 selection and predict its test split.
y_predict = get_y_predict(correlated_data)

In [None]:
# y_test is the module-level variable from the earlier top-level split;
# get_y_predict splits with the same test_size and random_state.
performance_metrics(correlated_data.columns.values, threshold, y_test, y_predict)

In [None]:
corrmat['medv']

In [None]:
# Re-read the raw dataset before re-selecting columns.
df = pd.read_csv("Desktop/Data/Boston.csv")

In [None]:
threshold = 0.70
corr_value = getCorrelatedFeature(corrmat['medv'], threshold)

In [None]:
corr_value.index.values

In [None]:
correlated_data = df[corr_value.index]

In [None]:
correlated_data.head()

In [None]:
# Refit on the threshold-0.70 selection and append the run to the metrics table.
# y_test is the module-level variable from the earlier top-level split.
y_predict = get_y_predict(correlated_data)
performance_metrics(correlated_data.columns.values, threshold, y_test, y_predict)

In [None]:
# Re-read the raw dataset before re-selecting columns.
df = pd.read_csv("Desktop/Data/Boston.csv")

In [None]:
# Hand-picked selection: the single feature 'rm' plus the target.
correlated_data = df[['rm', 'medv']]
correlated_data.head()

In [None]:
# NOTE(review): `threshold` still holds the value set for the previous run, so
# the 'corr_value' recorded for this row does not describe how 'rm' was chosen.
y_predict = get_y_predict(correlated_data)
performance_metrics(correlated_data.columns.values, threshold, y_test, y_predict)

In [None]:
# Re-read the raw dataset before re-selecting columns.
df = pd.read_csv("Desktop/Data/Boston.csv")

In [None]:
# Looser cutoff than the baseline run.
threshold = 0.40
corr_value = getCorrelatedFeature(corrmat['medv'], threshold)

In [None]:
corr_value.index.values

In [None]:
correlated_data = df[corr_value.index]

In [None]:
correlated_data.head()

In [None]:
# Refit on the threshold-0.40 selection and append the run to the metrics table.
# y_test is the module-level variable from the earlier top-level split.
y_predict = get_y_predict(correlated_data)
performance_metrics(correlated_data.columns.values, threshold, y_test, y_predict)

In [None]:
# Re-read the raw dataset before re-selecting columns.
df = pd.read_csv("Desktop/Data/Boston.csv")

In [None]:
# A cutoff of 0.0 keeps every column whose correlation with 'medv' is non-zero.
threshold = 0.0
corr_value = getCorrelatedFeature(corrmat['medv'], threshold)

In [None]:
corr_value.index.values

In [None]:
correlated_data = df[corr_value.index]

In [None]:
# Refit on this selection and append the run to the metrics table.
# y_test is the module-level variable from the earlier top-level split.
y_predict = get_y_predict(correlated_data)
performance_metrics(correlated_data.columns.values, threshold, y_test, y_predict)

In [None]:
from sklearn.model_selection import learning_curve, ShuffleSplit

In [None]:
def plot_learning_curve (estimator, title, X, y, ylim = None, cv = None,
                         n_jobs=None, train_sizes=np.linspace(.1, 1.0, 10)):
    """Plot training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.
    title : str
        Figure title.
    X, y : array-like
        Features and target handed to ``learning_curve``.
    ylim : tuple (ymin, ymax), optional
        Y-axis limits; when None the limits are left to matplotlib.
    cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the new figure current.
    """
    plt.figure()
    plt.title(title)
    # Honour the requested y-range; previously the argument was accepted but ignored.
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training example")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Mean and spread along axis 1 of the score arrays returned by learning_curve.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    # Names fixed from "test_socres_*": the misspelling made the plotting
    # calls below fail with NameError.
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()

    # Shaded bands: mean +/- one standard deviation.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha = 0.1,
                     color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha = 0.1, color='g')

    plt.plot(train_sizes, train_scores_mean, 'o-', color='r',
             label = 'Training score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g',
             label = 'Cross_validation score')

    plt.legend(loc ='best')
    return plt

# Uses whatever selection the last cell left in correlated_data
# (the threshold = 0.0 run when the notebook is executed top to bottom).
X = correlated_data.drop(labels = ['medv'], axis = 1)
y = correlated_data['medv']

# The feature names are appended directly to the title text.
title = 'Learning Curves (Linear Regression)' + str(X.columns.values)

# 100 random 80/20 splits with a fixed seed.
cv = ShuffleSplit(n_splits = 100, test_size = 0.2, random_state = 0)

estimator = LinearRegression()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs = -1)

plt.show()