In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Used for scientific and technical computing.
from scipy import stats
# Scikit-learn is probably the most useful library for machine learning in Python.
# The sklearn library contains a lot of efficient tools for machine learning and statistical modeling
# pip install scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Read the student scores CSV from the working directory, then print the
# first rows followed by the (rows, columns) shape of the frame.
df = pd.read_csv('./student_scores.csv')
print(df.head(), df.shape, sep='\n')

Define the model ourselves

In [None]:
def linear_formula(m_x, m_slope, m_intercept):
    """Evaluate the straight line ``y = m_slope * m_x + m_intercept``.

    Args:
        m_x: Point (or any object supporting ``*`` and ``+``) at which the
            line is evaluated.
        m_slope: Slope of the line.
        m_intercept: Value of the line at ``m_x == 0``.

    Returns:
        The result of ``m_slope * m_x + m_intercept``.
    """
    scaled = m_slope * m_x
    return scaled + m_intercept

In [None]:
# Fit a linear least-squares regression of Scores on Hours with SciPy.
# stats.linregress yields five values, unpacked below in this order:
#
#   slope      (float) - slope of the regression line.
#   intercept  (float) - intercept of the regression line.
#   rvalue     (float) - the Pearson correlation coefficient; its square is
#                        equal to the coefficient of determination.
#   pvalue     (float) - p-value for a hypothesis test whose null hypothesis
#                        is that the slope is zero, using a Wald test with
#                        the t-distribution of the test statistic (see the
#                        `alternative` parameter of linregress for other
#                        alternative hypotheses).
#   stderr     (float) - standard error of the estimated slope (gradient),
#                        under the assumption of residual normality.
(
    slope,
    intercept,
    correlation_coefficient,
    pvalue,
    standard_error,
) = stats.linregress(df["Hours"], df["Scores"])


In [None]:
# Report the parameters estimated by stats.linregress, one per line.
print("Values from stats module")
for label, value in (
    ("Slope: ", slope),
    ("Intercept: ", intercept),
    ("Correlation Coefficient: ", correlation_coefficient),
):
    print(label, value)

In [None]:
# Evaluate the fitted line at every observed "Hours" value.
# linear_formula only uses `*` and `+`, so it can be applied to the whole
# pandas column at once (vectorized) instead of calling it per element through
# map() and a lambda. list() keeps plot_line a plain list, as before.
plot_line = list(linear_formula(df["Hours"], slope, intercept))

In [None]:
# Scatter the raw observations and overlay the line fitted with
# stats.linregress ('b' = blue), using an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.scatter(df["Hours"], df["Scores"])
ax.plot(df["Hours"], plot_line, 'b')
ax.grid()
ax.set_title('Scatterplot of hours and scores with stats()')
ax.set_xlabel('Hours')
ax.set_ylabel('Scores')
plt.show()

In [None]:
# Predict the score for 6.5 hours of study with the hand-written line formula
# and the slope/intercept obtained from stats.linregress.
myModel_predicted = linear_formula(m_x=6.5, m_slope=slope, m_intercept=intercept)
print("My Model Predicted Value: ", myModel_predicted)

In [None]:
# Scikit-Learn's linear regression model expects a 2D input, but extracting
# the values of a single DataFrame column yields a 1D array, so each column
# is reshaped into a single-column 2D array.
#
# reshape(-1) flattens to one row vector, while reshape(-1, 1) produces a
# column:
#
#   >>> a
#   array([[ 1.,  2.,  3.],
#          [ 4.,  5.,  6.]])
#   >>> a.shape
#   (2, 3)
#   >>> a.reshape(-1)
#   array([ 1.,  2.,  3.,  4.,  5.,  6.])
#   >>> a.reshape(-1, 1)
#   array([[ 1.],
#          [ 2.],
#          [ 3.],
#          [ 4.],
#          [ 5.],
#          [ 6.]])
hours_values = df['Hours'].values
scores_values = df['Scores'].values
X = hours_values.reshape(-1, 1)
Y = scores_values.reshape(-1, 1)

# Each banner is printed on its own line, followed by the array it announces.
print(">>>>>>>>>>>> X >>>>>>>>: ", X, sep="\n")
print(">>>>>>>>>>>> Y >>>>>>>>: ", Y, sep="\n")

In [None]:
# test_size=0.2 means 20% of the data is held out as the test set and the
# remaining 80% is used for training.
# random_state pins the shuffle so that the split -- and therefore the fitted
# coefficients, plots and predictions below -- are identical on every
# "Restart Kernel -> Run All". Change the seed to draw a different split.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit the linear regression on the training portion only.
regressor = LinearRegression()
regressor.fit(x_train, y_train)


In [None]:
# Report the parameters learned by the scikit-learn model, then the pairwise
# correlation matrix of the whole DataFrame (computed by pandas, not sklearn).
print("Values from sklearn module")
for label, value in (
    ("Slope: ", regressor.coef_),
    ("Intercept: ", regressor.intercept_),
):
    print(label, value)
print("Correlation Coefficient: ", df.corr(), sep="\n")

In [None]:
# Training observations in red, with the model's predictions for those same
# inputs drawn as a blue line, on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.scatter(x_train, y_train, color='red')
ax.plot(x_train, regressor.predict(x_train), color='blue')

ax.grid()
ax.set_title('Scatterplot of hours and scores (Training set)')
ax.set_xlabel('Hours')
ax.set_ylabel('Scores')
plt.show()

In [None]:
# Ask the fitted model for the score at 6.5 hours. predict() takes a 2D
# input, so the single value is wrapped as one sample with one feature.
score = regressor.predict(np.array([[6.5]]))
print("LinearRegression() Model Predicted Value: ", score)


In [None]:
# Predict scores for the held-out test inputs.
y_pred = regressor.predict(x_test)

# Put inputs, true scores and predictions side by side.
# ravel() always returns a 1-D array; squeeze() would collapse a one-row test
# set to 0-d scalars, and pd.DataFrame rejects a dict made only of scalars.
df_preds = pd.DataFrame({
    'x test': x_test.ravel(),
    'Actual': y_test.ravel(),
    'Predicted': y_pred.ravel(),
})
print(df_preds)