In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Raise pandas' display limits so wide/long frames are shown rather than truncated.
# set_option accepts several (option, value) pairs in a single call.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

### Read the weight and height data for 10,000 sampled men

In [None]:
# Load the CSV that sits next to the notebook, then preview the first five rows.
df = pd.read_csv(
    filepath_or_buffer='weight-height.csv',
    encoding='utf-8',
)
df.head(5)

## Analyze the data

Just for fun, let's take a look at the data and graph it, so we know what we're working with.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Weight against height with a fitted regression line, plus each variable's
# marginal distribution along the edges.
sns.jointplot(
    data=df,
    x='Height',
    y='Weight',
    kind='reg',
)

In [None]:
# Histogram of the heights, ignoring any missing values.
sns.histplot(df['Height'].dropna())
# Dashed blue vertical line marking the median height.
plt.axvline(
    x=df['Height'].median(),
    linestyle='--',
    color='b',
)

### First, set up our initial variables

`X` is going to be the values we input into the system (the independent variable). It's traditionally capitalized because it is a matrix of inputs (as in f(X) = y).
`y` is going to be the dependent variable, which is what is being predicted. Scikit-learn works with NumPy arrays (and requires `X` to be two-dimensional), so we use `.values.reshape(-1, 1)` to convert each pandas column into a single-column array.

In [None]:
# Height is the input (X) and Weight is the target (y). Each column is pulled
# out as a NumPy array and reshaped to (n_samples, 1).
X, y = (
    df[column].values.reshape(-1, 1)
    for column in ('Height', 'Weight')
)

### Setup model 

The `clf` variable will now hold our model, which provides the prediction method.

In [None]:
# Create an unfitted ordinary least-squares linear regression estimator.
clf = linear_model.LinearRegression()

### Split our data into four parts

This will create four variables:

* `X_train` - This will have 80 percent of our Height column (the X axis). It's randomly chosen.
* `X_test`  - This will have the remaining 20 percent of the heights to test against.
* `y_train` - This will have the Weight values for the same 80 percent of rows as `X_train`.
* `y_test`  - This will have the remaining 20 percent of the weights to test against.

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle deterministic, so the fitted model, the predictions, the chart and
# the final score are identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Let's explore these variables

Let's take a look at each of these.

In [None]:
# Inspect the training inputs: one height per row, shaped (n_samples, 1) by the
# earlier reshape(-1, 1).
X_train

In [None]:
# Number of held-out rows (the first dimension of the test target array).
y_test.shape[0]

### Setup the Actual Model

This is the "machine learning" part, where we fit our model based on the training data.

In [None]:
# Fit the regression on the training rows only; the test rows stay unseen.
clf.fit(X_train, y_train)

### Let's make some predictions with our 20% test data

Using the 20 percent test data we set aside, let's try to see if we can predict weights from heights.

In [None]:
# Predict a weight for every held-out height; these rows were not used in fit().
y_pred = clf.predict(X_test)

### Compare the results

For each row, our model makes a weight prediction based on the height given in `X_test`. Then we compare that to the true data. Hmm, we're a bit off.

In [None]:
# Put the true and predicted weights side by side, one row per test sample.
# y_test and y_pred are both (n, 1) column arrays, so stacking them
# column-wise yields an (n, 2) table.
df2 = pd.DataFrame(
    np.column_stack((y_test, y_pred)),
    columns=['Actual', 'Predicted'],
)
df2

### Graph the results

...but not as far off as you might think. While no model is perfect, it can be useful in some situations.

In [None]:
# Plot only the first 15 test samples so the individual bars stay readable.
df3 = df2.head(15)
# DataFrame.plot returns the Axes it drew on; use it to add a title and axis
# labels so the figure can be understood without the surrounding text.
ax = df3.plot(kind='bar', figsize=(16, 10))
ax.set_title('Actual vs. predicted weight (first 15 test samples)')
ax.set_xlabel('Test sample')
ax.set_ylabel('Weight')
plt.show()

### Calculate our accuracy

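Let's see how accurate we were. For a regression model, `score` returns the coefficient of determination ($R^2$) rather than a classification-style accuracy: 1.0 means perfect predictions.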

In [None]:
# R^2 (coefficient of determination) of the model on the held-out rows.
# This is the same quantity that clf.score(X_test, y_test) reports.
my_accuracy = metrics.r2_score(y_test, clf.predict(X_test))
my_accuracy