# Concepts

### Supervised learning
 - Automate time-consuming or expensive manual tasks
 - Make predictions about the future
 - Need labeled data
   - Historical data with labels
   - Experiments to get labeled data
   - Crowd-sourcing labeled data

In [None]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
iris = datasets.load_iris()
type(iris)

In [None]:
print(iris.keys())

In [None]:
type(iris.data), type(iris.target)

In [None]:
iris.data.shape

In [None]:
iris.target_names

# EDA

In [None]:
import pandas as pd
x = iris.data
y = iris.target
df = pd.DataFrame(x, columns=iris.feature_names)
df.head(3)

In [None]:
_ = pd.plotting.scatter_matrix(df, c=y, figsize=[8,8], s=150, marker='D')

# Viz

In [None]:
import pandas as pd
import seaborn as sns
df = pd.DataFrame({'category':[1,0,0,1], 'boolean1':[0,1,0,1], 'boolean2':[0,1,1,1], 'boolean3':[1,1,1,0]})

plt.figure()
sns.countplot(x='boolean2', hue='category', data=df, palette='RdBu')
plt.xticks([0,1], ['No', 'Yes'])
plt.show()

# k-NN

### Concepts
 - Basic idea: Predict the label of a data point by
   - looking at 'k' closest labeld data points
   - Taking a majority vote

### Scikit-learn fit and predict
 - All machine learning models implemented as python classes
   - They implement models implemented as python classes
   - Store the information learned from the data
 - Training a model on the data='fitting' a model to the data
   - .fit() method
 - To predict the labels of new data: .predict() method

#### Fit classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(iris['data'], iris['target'])

In [None]:
iris['data'].shape

In [None]:
iris['target'].shape

#### Predict on unlabeled data

In [None]:
x_new = np.array([[5.6, 2.8, 3.9, 1.1],
                  [5.7, 2.6, 3.8, 1.3],
                  [4.7, 3.2, 1.3, 0.2]])
prediction = knn.predict(x_new)
x_new.shape

In [None]:
print('Prediction: {}'.format(prediction))

# k-NN Case

#### Fit k-NN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

df = pd.DataFrame({'category':['a','b','b','b','b','a'], 
                   'boolean1':[0,1,0,1,0,0], 
                   'boolean2':[0,1,1,1,1,1], 
                   'boolean3':[1,1,1,0,1,0]})

y = df['category'].values
X = df.drop('category', axis=1).values

knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X, y)

#### Predict

In [None]:
X_new = pd.DataFrame([[0,0,0]])

y_pred = knn.predict(X)
print("Prediction: {}".format(y_pred))

new_prediction = knn.predict(X_new)
print("Prediction: {}".format(new_prediction))

# k-NN Measuring model performance
 - Could compute accuracy on data used to fit classifier
 - NOT indicative of ability to generalize
 - Split data into training and test set
 - Fit/train the classifier on the training set
 - Make predictions on test set
 - Compare predictions with the known labels

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Test set predictions:", "\n", y_pred)
#print(\\n {}\".format(y_pred))

# k-NN Model Complexity
 - Larger k = smoother decision boundary = less complex model = underfitting
 - Smaller k = more complex model = can lead to overfitting = overfitting

# Regression

In [None]:
import pandas as pd 
boston = pd.read_csv('../TIL_data_git/Boston.csv', index_col=0)
print(boston.shape)
boston.head(2)

#### create feature and target arrays

In [None]:
X = boston.drop('medv', axis=1).values
y = boston['medv'].values

X_rooms = X[:,5]
type(X_rooms), type(y)

In [None]:
y = y.reshape(-1,1)
X_rooms = X_rooms.reshape(-1,1)

y.shape, X_rooms.shape

#### plot house value & #rooms

In [None]:
plt.scatter(X_rooms, y)
plt.ylabel('value of house')
plt.xlabel('# rooms')
plt.show()

#### Fit a regression model

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_rooms, y)
prediction_space = np.linspace(min(X_rooms), max(X_rooms)).reshape(-1,1)


In [None]:
plt.scatter(X_rooms, y, color='blue')
plt.plot(prediction_space, reg.predict(prediction_space), color='black', linewidth=3)
plt.show()

# Regression case

In [None]:
# Import numpy and pandas
import numpy as np
import pandas as pd

# Read the CSV file into a DataFrame: df
df = pd.read_csv('../TIL_data_git/gapminder2.csv')

# Create arrays for features and target variable
y = df['life']
X = df['fertility']

# Print the dimensions of y and X before reshaping
print("Dimensions of y before reshaping: ", y.shape)
print("Dimensions of X before reshaping: ", X.shape)

# # Reshape X and y
# y_reshaped = y.reshape(-1,1)
# X_reshaped = X.reshape(-1,1)

# Print the dimensions of y_reshaped and X_reshaped
print("Dimensions of y after reshaping: ", y.shape)
print("Dimensions of X after reshaping: ", X.shape)

In [None]:
import seaborn as sns
sns.heatmap(df.corr(), square=True, cmap='RdYlGn')

In [None]:
df.info()

In [None]:
df['life'].describe()

# Regression - Mechanics
 - Define an error functions for any given line
 - Choose the line that minimize the error function
 - Ordinary least squares(OLS): Minimize sum of squares of residuals

# Regression Template code

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)
reg_all = LinearRegression()

reg_all.fit(X_train, y_train)
reg_all.fit(X_train, y_train)
y_pred = reg_all.predict(X_test)
reg_all.score(X_test, y_test)

# tmp

In [None]:
import numpy as np
## population parameters
beta0 = 1
beta1 = -2
beta2 = 6
beta3 = -1
sigma = 2

np.random.seed(7890)

## training data
x = np.arange(0 , 5 , 0.05)
f_x = beta0 + beta1 * x + beta2 * x**2 + beta3 * x**3
epsilon = np.random.normal(loc=0, scale=sigma, size=100)
y = f_x + epsilon

## test data
x_test = np.arange(0 , 6, 0.1)
f_x_test = beta0 + beta1 * x_test + beta2 * x_test**2 + beta3 * x_test**3
epsilon_test = np.random.normal(loc=0, scale=sigma, size=len(x_test))
y_test = f_x_test + epsilon_test

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

def udf_mse(y_true, y_pred):
    y_true = list(y_true)   # control edge case: unify the type
    y_pred = list(y_pred)   # control edge case: unify the type
    return(np.square(np.subtract(y_true,y_pred)).mean())

df = pd.DataFrame({'y':y, 'x':x})
model_one = smf.ols('y ~ x', data=df).fit()

print('-------------------------------------')
print('** parameters in model_one **')
print(model_one.params)

# # You can uncomment the code to see the summary of model
# display(model_one.summary())

x_df = df[['x']]
y_pred = model_one.predict(x_df)
y_true = y

y_test_pred = model_one.params.Intercept + model_one.params.x * x_test
y_test_true = y_test

print('-------------------------------------')
print('** Training and Test MSE **')
print('train data MSE:\t {:>15}\t'.format(udf_mse(y_pred, y_true)))
print('test data MSE:\t {:>15}\t'.format(udf_mse(y_test_pred, y_test_true)))
print('-------------------------------------')

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

df = pd.DataFrame({'y':y, 'x':x})
X = df.drop('y',axis=1).values
y = df['y'].values

reg = LinearRegression()
reg.fit(X, y)

print('-------------------------------------')
print('** parameters in model_one **')
print('Intercept:\t {:>15}\t'.format(reg.intercept_))
print('x_coefficient:\t {:>15}\t'.format(reg.coef_[0]))

y_true = y
y_pred = reg.predict(X)

y_test_true = y_test
x_test = x_test.reshape(-1,1)
y_test_pred = reg.predict(x_test)

mse_train = mean_squared_error(y_true,y_pred)
mse_test = mean_squared_error(y_test_true,y_test_pred)

print('-------------------------------------')
print('** Training and Test MSE **')
print('train data MSE:\t {:>15}\t'.format(mse_train))
print('test data MSE:\t {:>15}\t'.format(mse_test))
print('-------------------------------------')

# # You can uncomment the code to see the visualization
# prediction_space = np.linspace(min(X), max(X)).reshape(-1,1)
# plt.scatter(X, y, color='blue')
# plt.plot(prediction_space, reg.predict(prediction_space), color='black', linewidth=3)
# plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

df = pd.DataFrame({'y':y, 'x':x})
X = df.drop('y',axis=1).values
y = df['y'].values

reg = LinearRegression()
reg.fit(X, y)

print('-------------------------------------')
print('** parameters in model_one **')
print('Intercept:\t {:>15}\t'.format(reg.intercept_))
print('x_coefficient:\t {:>15}\t'.format(reg.coef_[0]))

y_true = y
y_pred = reg.predict(X)

y_test_true = y_test
x_test = x_test.reshape(-1,1)
y_test_pred = reg.predict(x_test)

mse_train = mean_squared_error(y_true,y_pred)
mse_test = mean_squared_error(y_test_true,y_test_pred)

print('-------------------------------------')
print('** Training and Test MSE **')
print('train data MSE:\t {:>15}\t'.format(mse_train))
print('test data MSE:\t {:>15}\t'.format(mse_test))
print('-------------------------------------')

# # You can uncomment the code to see the visualization
# prediction_space = np.linspace(min(X), max(X)).reshape(-1,1)
# plt.scatter(X, y, color='blue')
# plt.plot(prediction_space, reg.predict(prediction_space), color='black', linewidth=3)
# plt.show()

#### regression2 answer

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

df = pd.DataFrame({'y':y, 'x':x})
for i in range(2,9):
    cname = 'x'+str(i)
    df[cname] = [j**i for j in df['x']]

X = df.drop('y',axis=1).values
y = df['y'].values

reg = LinearRegression()
reg.fit(X, y)

print('----------------------------------')
print('** parameters in model_one **')
print('Intercept:',reg.intercept_)
print('x_coefficient:', '\n', reg.coef_)

y_true = y
y_pred = reg.predict(X)


df_test = pd.DataFrame({'y':y_test, 'x':x_test})
for i in range(2,9):
    cname = 'x'+str(i)
    df_test[cname] = [j**i for j in df_test['x']]


y_test_true = y_test                        #array
x_test = df_test.drop('y', axis=1).values   #array
y_test_pred = reg.predict(x_test)

mse_train = mean_squared_error(y_true,y_pred)
mse_test = mean_squared_error(y_test_true,y_test_pred)

print('-------------------------------------')
print('** Training and Test MSE **')
print('train data MSE:\t {:>15}\t'.format(mse_train))
print('test data MSE:\t {:>15}\t'.format(mse_test))
print('-------------------------------------')

In [2]:
import numpy as np
## population parameters
beta0 = 1
beta1 = -2
beta2 = 6
beta3 = -1
sigma = 2

np.random.seed(7890)

## training data
x = np.arange(0 , 5 , 0.05)
f_x = beta0 + beta1 * x + beta2 * x**2 + beta3 * x**3
epsilon = np.random.normal(loc=0, scale=sigma, size=100)
y = f_x + epsilon

## test data
x_test = np.arange(0 , 6, 0.1)
f_x_test = beta0 + beta1 * x_test + beta2 * x_test**2 + beta3 * x_test**3
epsilon_test = np.random.normal(loc=0, scale=sigma, size=len(x_test))
y_test = f_x_test + epsilon_test

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

df = pd.DataFrame({'y':y, 'x':x})
for i in range(2,9):
    cname = 'x'+str(i)
    df[cname] = [j**i for j in df['x']]

X = df.drop('y',axis=1).values
y = df['y'].values

reg = LinearRegression()
reg.fit(X, y)

print('----------------------------------')
print('** parameters in model_one **')
print('Intercept:',reg.intercept_)
print('x_coefficient:', '\n', reg.coef_)

y_true = y
y_pred = reg.predict(X)


df_test = pd.DataFrame({'y':y_test, 'x':x_test})
for i in range(2,9):
    cname = 'x'+str(i)
    df_test[cname] = [j**i for j in df_test['x']]


y_test_true = y_test                        #array
x_test = df_test.drop('y', axis=1).values   #array
y_test_pred = reg.predict(x_test)

mse_train = mean_squared_error(y_true,y_pred)
mse_test = mean_squared_error(y_test_true,y_test_pred)

print('-------------------------------------')
print('** Training and Test MSE **')
print('train data MSE:\t {:>15}\t'.format(mse_train))
print('test data MSE:\t {:>15}\t'.format(mse_test))
print('-------------------------------------')

----------------------------------
** parameters in model_one **
Intercept: 0.4476474416374572
x_coefficient: 
 [-5.07623714e+00  2.69291016e+01 -4.32305478e+01  3.90613579e+01
 -1.88667252e+01  4.92058110e+00 -6.56484560e-01  3.51275645e-02]
-------------------------------------
** Training and Test MSE **
train data MSE:	 4.217720841560748	
test data MSE:	 1855.4709460860242	
-------------------------------------


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

df = pd.DataFrame({'y':y, 'x':x})
for i in range(2,8):
    cname = 'x'+str(i)
    df[cname] = [j**i for j in df['x']]

X = df.drop('y',axis=1).values
y = df['y'].values

reg = LinearRegression()
reg.fit(X, y)

print('----------------------------------')
print('** parameters in model_one **')
print('Intercept:',reg.intercept_)
print('x_coefficient:', '\n', reg.coef_)

y_true = y
y_pred = reg.predict(X)


df_test = pd.DataFrame({'y':y_test, 'x':x_test})
for i in range(2,9):
    cname = 'x'+str(i)
    df_test[cname] = [j**i for j in df_test['x']]


y_test_true = y_test                        #array
x_test_arr = df_test.drop('y', axis=1).values   #array
y_test_pred = reg.predict(x_test_arr)

mse_train = mean_squared_error(y_true,y_pred)
mse_test = mean_squared_error(y_test_true,y_test_pred)

print('-------------------------------------')
print('** Training and Test MSE **')
print('train data MSE:\t {:>15}\t'.format(mse_train))
print('test data MSE:\t {:>15}\t'.format(mse_test))
print('-------------------------------------')

----------------------------------
** parameters in model_one **
Intercept: -0.288315590936179
x_coefficient: 
 [  7.86633959 -21.34821668  30.00090629 -17.09300686   4.882698
  -0.69536803   0.03904122]


Exception: Data must be 1-dimensional

In [5]:
pd.DataFrame({'y':y_test, 'x':x_test})

Exception: Data must be 1-dimensional