## Linear Predictions
### Predicting weight from height 

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

We use a publicly available dataset containing the heights and weights of 25,000 individuals, each of whom was 18 years old at the time of measurement.



In [None]:
# Load the height/weight table from a CSV file located in the working directory.
# Later cells read its "Height(Inches)" and "Weight(Pounds)" columns.
df=pd.read_csv("SOCR-HeightWeight.csv")
# Bare last expression: the notebook renders the frame as a table.
df

Unfortunately, this dataset was assembled in the US, so the weights are in pounds and the heights are in inches. Our first step after loading the dataset is therefore to convert the units to kg and cm.

In [None]:
# Conversion factors from US customary units to metric.
CM_PER_INCH = 2.54
KG_PER_POUND = 0.453592

# Add metric versions of both measurements; the original columns are kept.
df["Height(cm)"] = df["Height(Inches)"].mul(CM_PER_INCH)
df["Weight(kg)"] = df["Weight(Pounds)"].mul(KG_PER_POUND)

df

## Looking at the data 


In [None]:
# Feature matrix: height in centimetres.
# Selected by column name instead of by position (previously iloc[:, 3:4]) so
# the choice does not silently change if the column order of `df` changes.
# The double brackets keep X a one-column DataFrame (2-D), which is the shape
# scikit-learn estimators expect for the features.
X = df[["Height(cm)"]]
X

In [None]:
# Target: weight in kilograms.
# Selected by column name instead of by position (previously iloc[:, 4:5]) so
# the choice does not silently change if the column order of `df` changes.
# Kept as a one-column DataFrame (2-D) because later cells rely on that shape.
y = df[["Weight(kg)"]]
y

In [None]:
# Scatter plot of the whole dataset: one point per individual.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue')
# The title was previously an empty string; give the figure a real title so
# it can be understood on its own.
ax.set_title("Weight vs. height")
ax.set_xlabel("Height (cm)")
ax.set_ylabel("Weight (kg)")
plt.show()

## Building and evaluating a linear model 

In [None]:
# Hold out 500 samples for testing; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=500,
    random_state=0,
)


In [None]:
# Ordinary least-squares fit of weight on height using the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
regressor = LinearRegression().fit(X_train, y_train)

# Fitted values on the training data.
y_pred = regressor.predict(X_train)

## Looking at the fit 

In [None]:
# Training data (red points) with the fitted regression line (blue).
fig, ax = plt.subplots()
ax.scatter(X_train, y_train, color='red')
ax.plot(X_train, regressor.predict(X_train), color='blue')
ax.set_title('Training Set')
ax.set_xlabel('Height')
ax.set_ylabel('Weight')
plt.show()

### Prediction on the test set

In [None]:
# Held-out test data (red points) with the model's predictions (blue line).
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='red')
ax.plot(X_test, regressor.predict(X_test), color='blue')
ax.set_title('Test Set')
ax.set_xlabel('Height')
ax.set_ylabel('Weight')
plt.show()

### How good is the prediction? 



In [None]:
# Residuals of the linear model on the test set: actual minus predicted.
# Both operands are (n, 1)-shaped, so the subtraction is element-wise.
test_predictions = regressor.predict(X_test)
errors = y_test - test_predictions

print(errors)

# Kernel-density estimate of the residual distribution.
density_ax = errors.plot.density(color='green')
density_ax.set_title('Error Values')
plt.show()



In [None]:
# Absolute size of each residual, ignoring its sign.
abserrors = errors.abs()

abserrors

In [None]:
# Mean absolute error of the linear model on the test set
# (same units as the target column).
abserrors.mean()

### Can a different type of model give a better prediction? 



In [None]:
from sklearn import svm

# Support-vector regression with scikit-learn's default hyperparameters.
regr = svm.SVR()
# SVR expects a 1-D target. y_train is a one-column DataFrame, so flatten it
# explicitly; passing the column vector makes scikit-learn emit a
# DataConversionWarning.
regr.fit(X_train, y_train.values.ravel())

# The SVR prediction need not be a straight line, so draw it through the test
# heights in ascending order; joining unsorted points makes the line zigzag.
X_test_sorted = X_test.sort_values(by=X_test.columns[0])

plt.scatter(X_test, y_test, color='red')
plt.plot(X_test_sorted, regr.predict(X_test_sorted), color='blue')
plt.title('Test Set')
plt.xlabel('Height')
plt.ylabel('Weight')
plt.show()

In [None]:
# Residuals of the SVR model on the test set: actual minus predicted.
#
# y_test is a one-column DataFrame, so y_test.values has shape (n, 1), while
# regr.predict() returns shape (n,). Subtracting those directly broadcasts to
# an (n, n) matrix, and taking row 0 of it yields "first true value minus
# every prediction" rather than the per-sample errors. Flatten the targets
# first so the subtraction is element-wise and errors2 has shape (n,).
errors2 = y_test.values.ravel() - regr.predict(X_test)

# Kernel-density estimate of the residual distribution.
pd.Series(errors2).plot.density(color='green')
plt.title('Error Values')
plt.show()

In [None]:
# Collect the residuals of both models side by side for comparison.
# `errors` is an (n, 1) DataFrame, so flatten its values into a plain list.
linearerrors = list(errors.values.ravel())
svmerrors = list(errors2)

dferrors = pd.DataFrame({"linear": linearerrors, "SVM": svmerrors})

print(dferrors)

# One box per model showing the spread of signed residuals.
dferrors.boxplot()



In [None]:
# Absolute residuals for both models (sign dropped).
dfabserrors = dferrors.abs()

# One box per model showing the spread of absolute residuals.
dfabserrors.boxplot()

In [None]:
# Mean absolute error per model, one value for each column of dfabserrors.
dfabserrors.mean()