# MACHINE LEARNING MODELS

# REGRESSORS

# Simple Linear Regression

## Method

In [2]:
# Importing the Tools

import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

In [None]:
# Feature matrix (five area-level predictors) and target vector (price)
feature_cols = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]
X = df[feature_cols]
y = df['Price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

# Report the dimensions of every split, in the order train/test X then train/test y
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

# Create the estimator and learn the coefficients from the training split
# (fit() returns the estimator itself, so the call can be chained)
linreg = LinearRegression().fit(X_train, y_train)

# Show the fitted intercept followed by the raw coefficient array
print(linreg.intercept_)
print(linreg.coef_)

# Tabulate the coefficients, one row per feature name
coeff_df = pd.DataFrame(
    data=linreg.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df


In [None]:
# Visualisation of the errors

# Predict on the held-out features with the fitted model. The estimator in this
# notebook is named `linreg`; the previous `lm` was never defined.
y_pred = linreg.predict(X_test)

# Residuals: observed target minus predicted target
residuals = y_test - y_pred

# Use two separate axes so the histogram is not drawn on top of the scatter plot
fig, (ax_scatter, ax_resid) = plt.subplots(1, 2, figsize=(14, 5))

# Plotting the predictions vs the test set
ax_scatter.scatter(y_test, y_pred)
ax_scatter.set(title='Predicted vs. actual', xlabel='Actual', ylabel='Predicted')

# Plotting the errors.
# seaborn is imported as `seabornInstance` in this notebook (there is no `sns`),
# and `distplot` is deprecated, so `histplot` with a KDE overlay is used instead.
seabornInstance.histplot(residuals, bins=50, kde=True, stat='density', ax=ax_resid)
ax_resid.set(title='Distribution of the errors', xlabel='Actual - predicted')

plt.show()

## Validation Metrics

In [None]:
Mean Absolute Error (MAE) is the mean of the absolute values of the errors.
Mean Squared Error (MSE) is the mean of the squared errors.
Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors.

Comparing these metrics:

MAE is the easiest to understand because it is the average absolute error.
MSE is more popular than MAE because MSE “punishes” larger errors more heavily, which tends to be useful in the real world.
RMSE is even more popular than MSE because RMSE is expressed in the same units as “y”, which makes it directly interpretable.

In [None]:
# to get the metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Compute the predictions before any metric uses them, so this cell does not
# depend on a `y_pred` left over from another cell.
y_pred = linreg.predict(X_test)

# Mean absolute error, mean squared error and its square root on the test split
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


# Different way, just for illustration: import the metric function directly
MSE = mean_squared_error(y_test, y_pred)
print(MSE)

In [None]:
# To get the summary
# scikit-learn's LinearRegression has no summary() method (calling it raises
# AttributeError), so the report is assembled from the fitted attributes and
# from score(), which returns the R^2 of the predictions.
print('Intercept:', linreg.intercept_)
print('R^2 (train):', linreg.score(X_train, y_train))
print('R^2 (test):', linreg.score(X_test, y_test))

# One row per feature, labelled with the training columns
pd.DataFrame({'Coefficient': linreg.coef_}, index=X_train.columns)


## Predictions

In [None]:
# Let's say that the model inputs are
# NOTE(review): these columns differ from the housing columns used earlier, so
# `df` is presumably a different dataset here - confirm it is loaded first.
X = df[['Weight', 'Volume']]
y = df['CO2']

# `LinearRegression` is imported directly at the top of the notebook; the
# `linear_model` module itself is never imported, so it cannot be referenced.
regr = LinearRegression()
regr.fit(X, y)

# Predict the CO2 emission of a car where the weight is 2300kg and the volume is 1300ccm.
# The input is wrapped in a DataFrame with the training column names so that
# scikit-learn does not warn about missing feature names.
new_car = pd.DataFrame([[2300, 1300]], columns=X.columns)
predictedCO2 = regr.predict(new_car)

print(predictedCO2)

# Multiple Linear Regression

In [None]:
Almost all the real-world problems that you are going to encounter will have more than two variables.
Linear regression involving multiple variables is called “multiple linear regression” or multivariate linear regression.
The steps to perform multiple linear regression are almost identical to those of simple linear regression.

The difference lies in the evaluation.
You can use a multiple regression model to find out which factor has the highest impact on the predicted output and how the different variables relate to each other.

# Logistic Regression

## Method

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the classifier and train it in one chained call (fit() returns the estimator)
logreg = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out features
y_pred = logreg.predict(X_test)

# score() on the training split, expressed as a percentage with two decimals
acc_log = round(100 * logreg.score(X_train, y_train), 2)
acc_log

## Validation

### Correlation coeff

In [None]:
# Build a one-column table of feature names from stand_dfX's column labels,
# with the first label removed by delete(0).
# NOTE(review): stand_dfX is not defined in this part of the notebook - presumably
# a standardised feature frame whose first column is not a model input; confirm.
# This rebinds coeff_df, which the linear regression cell also assigns.
coeff_df = pd.DataFrame(stand_dfX.columns.delete(0))
coeff_df.columns = ['Feature']
# The "Correlation" column holds the fitted logistic regression coefficients
# (first row of logreg.coef_), not correlation coefficients.
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

# Display the rows ordered from the largest coefficient to the smallest
coeff_df.sort_values(by='Correlation', ascending=False)

### Metrics

In [None]:
# Predicted class labels for the held-out features
y_pred = logreg.predict(X_test)

from sklearn import metrics

# Original outputs, kept unchanged: mean squared error and R^2 computed on the
# predicted labels (both are regression metrics).
MSE = metrics.mean_squared_error(y_test, y_pred)
RSq = metrics.r2_score(y_test, y_pred)
print(MSE)
print(RSq)

# LogisticRegression is a classifier, so classification metrics are reported too:
# the fraction of correctly predicted labels and the per-class confusion counts.
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

### K-fold

In [None]:
from sklearn import model_selection
import warnings

# 10-fold splitter. shuffle=True is required for random_state to have any
# effect; current scikit-learn raises ValueError when random_state is set
# while shuffle is left at its default of False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=100)
model_kfold = LogisticRegression()

# Silence warnings only while cross-validating, instead of disabling every
# warning for the rest of the notebook.
# NOTE(review): the folds are drawn from X_test / y_test, i.e. only the
# held-out split - confirm this is intended rather than the training data.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    results_kfold = model_selection.cross_val_score(model_kfold, X_test, y_test, cv=kfold)

# Mean score across the folds as a percentage, then the per-fold scores
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0))
print(results_kfold)

# KNN Regression

## Method

In [None]:
from sklearn import neighbors

# Ten-neighbour regressor, created and fitted on the training split in one step
knn = neighbors.KNeighborsRegressor(n_neighbors=10).fit(X_train, y_train)

# Predictions for the held-out features
y_pred = knn.predict(X_test)

# score() on the training split, as a percentage rounded to two decimals
acc_knn = round(100 * knn.score(X_train, y_train), 2)
acc_knn

# CLASSIFIERS

# KNN Classifier

## Method

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Three-neighbour classifier, created and fitted on the training split in one step
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predicted labels for the held-out features
Y_pred = knn.predict(X_test)

# score() on the training split, as a percentage rounded to two decimals
acc_knn = round(100 * knn.score(X_train, y_train), 2)
acc_knn

## Optimization: find best n

In [None]:
# Share of test predictions that differ from the true label, for K = 1..9
error_rate = []

for i in range(1, 10):
    # Fit a fresh classifier with i neighbours (fit() returns the estimator)
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # Element-wise mismatch flags; their mean is the error rate for this K
    mismatches = pred_i != y_test
    error_rate.append(np.mean(mismatches))

In [None]:
# Open a 10x6 inch figure for the error curve
plt.figure(figsize=(10, 6))

# Dashed blue line through red-filled circular markers, one point per K in 1..9
plt.plot(
    range(1, 10),
    error_rate,
    marker='o',
    markersize=10,
    markerfacecolor='red',
    linestyle='dashed',
    color='blue',
)

# Axis labels and title; the y-label call stays last so the cell's displayed value is unchanged
plt.xlabel('K')
plt.title('Error Rate vs. K Value')
plt.ylabel('Error Rate')

# DECISION TREES