# Install scikit-learn


`pip install scikit-learn`

# Load CSV file

In [1]:
import pandas as pd

# Read the demo data set into a DataFrame.
df = pd.read_csv('demo.csv')

# If the file had no header row, it would be loaded like this instead:
#   df = pd.read_csv('demo.csv', header=None)

# Show the full frame followed by its (rows, columns) shape.
print(df)
print(df.shape)

# Pull one column out of the frame as a NumPy array and show it.
number_room_values = df['number_room'].values
print(number_room_values)

   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
3   NaN          2.0          770        old
4   NaN          NaN          870      young
(5, 4)
[ 3.  5.  2.  2. nan]


# Handling missing values

In [2]:
import pandas as pd

# load csv file
df = pd.read_csv('demo.csv')

print('origin dataframe')
print(df)

print('drop row that contain any missing value')
# dropna() returns a new frame without the rows that hold a NaN; df itself
# is left untouched, so it can still be used for the fill example below.
df_no_missing = df.dropna()
print(df_no_missing)

print('fill missing value with mean')
# Replace the NaNs in "size" with the column mean and assign the result back.
# Calling fillna(..., inplace=True) on df["size"] is chained assignment on a
# column selection: pandas 2.x warns about it, and with Copy-on-Write enabled
# it no longer modifies df at all.
df["size"] = df["size"].fillna(df["size"].mean())
print(df)

origin dataframe
   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
3   NaN          2.0          770        old
4   NaN          NaN          870      young
drop row that contain any missing value
   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
fill missing value with mean
   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
3  34.0          2.0          770        old
4  34.0          NaN          870      young


# Encoding categorical features

In [3]:
import pandas as pd

# Reload the data so this cell starts from the untouched CSV contents.
df = pd.read_csv('demo.csv')

print('origin dataframe')
print(df)

print('encode category')
# Treat the text column as a categorical, then overwrite the column with the
# integer code of each row's category.
house_type_categorical = pd.Categorical(df['house_type'])
df['house_type'] = house_type_categorical.codes
print(df)

origin dataframe
   size  number_room  house_price house_type
0  40.0          3.0          800        old
1  29.0          5.0          700      young
2  33.0          2.0          670      young
3   NaN          2.0          770        old
4   NaN          NaN          870      young
encode category
   size  number_room  house_price  house_type
0  40.0          3.0          800           0
1  29.0          5.0          700           1
2  33.0          2.0          670           1
3   NaN          2.0          770           0
4   NaN          NaN          870           1


# Convert a DataFrame to a NumPy array (and back)

In [5]:
import pandas as pd
import numpy  as np

# Reload the data from disk.
df = pd.read_csv('demo.csv')

# DataFrame -> NumPy array.
print('change dataframe to numpy array')
numpy_array = np.array(df)
print(numpy_array)

# NumPy array -> DataFrame. No column names are passed, so the new frame
# gets default integer column labels.
print('change numpy array to dataframe')
df_from_numpy = pd.DataFrame(data=numpy_array)
print(df_from_numpy)


change dataframe to numpy array
[[40.0 3.0 800 'old']
 [29.0 5.0 700 'young']
 [33.0 2.0 670 'young']
 [nan 2.0 770 'old']
 [nan nan 870 'young']]
change numpy array to dataframe
     0    1    2      3
0   40    3  800    old
1   29    5  700  young
2   33    2  670  young
3  NaN    2  770    old
4  NaN  NaN  870  young


# Split data

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split

# Toy data: 10 samples with 2 features each, plus one label per sample.
x, y = np.arange(20).reshape((10, 2)), np.arange(10)

print('before splitting......')

print("x: {}\n".format(x))
print("y: {}\n".format(y))

print("shape of x: {}".format(x.shape))
print("shape of y: {}\n".format(y.shape))

# Hold out 30% of the samples for testing. train_test_split shuffles the
# samples before splitting, so random_state is pinned to make the split the
# same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
print('after splitting......')

print("x_train: {}\n".format(x_train))
print("x_test: {}\n".format(x_test))

print("y_train: {}\n".format(y_train))
print("y_test: {}\n".format(y_test))

before splitting......
x: [[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 6  7]
 [ 8  9]
 [10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]]

y: [0 1 2 3 4 5 6 7 8 9]

shape of x: (10, 2)
shape of y: (10,)

after splitting......
x_train: [[ 8  9]
 [ 0  1]
 [16 17]
 [14 15]
 [12 13]
 [ 4  5]
 [10 11]]

x_test: [[18 19]
 [ 2  3]
 [ 6  7]]

y_train: [4 0 8 7 6 2 5]

y_test: [9 1 3]



# Preprocessing Data

## Standardize data to zero mean and unit standard deviation

In [7]:
from sklearn import preprocessing
import numpy as np

# Training matrix: one sample per row, one feature per column.
x_train = np.array([[ 100., -1.,  2.],
                    [ 900.,  0.,  0.],
                    [ 200.,  1., -1.]])

# Column-wise statistics computed directly with NumPy, for comparison with
# the values the scaler reports below.
print("mean of x_train: {}".format(np.mean(x_train, axis=0)))
print("std of x_train: {}\n".format(np.std(x_train, axis=0)))

# Fit the scaler on the training data, then show the statistics it stored.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)

print("mean of x_scale: {}".format(scaler.mean_))
print("std of x_scale: {}\n".format(scaler.scale_))

# Standardize the training data with the fitted statistics.
x_train = scaler.transform(x_train)

print("after standardiztion......")
print('x_train: {}'.format(x_train))

# New (test) data goes through the already-fitted scaler; it is not refitted.
x_test = np.array([[-1., 1., 0.]])
print("apply same mean and std to new data(test data)\n")

x_test = scaler.transform(x_test)
print('x_test: {}'.format(x_test))



mean of x_train: [4.00000000e+02 0.00000000e+00 3.33333333e-01]
std of x_train: [355.9026084    0.81649658   1.24721913]

mean of x_scale: [4.00000000e+02 0.00000000e+00 3.33333333e-01]
std of x_scale: [355.9026084    0.81649658   1.24721913]

after standardiztion......
x_train: [[-0.84292723 -1.22474487  1.33630621]
 [ 1.40487872  0.         -0.26726124]
 [-0.56195149  1.22474487 -1.06904497]]
apply same mean and std to new data(test data)

x_test: [[-1.12671273  1.22474487 -0.26726124]]


## Scale data into a fixed range (min-max scaling)

In [8]:
from sklearn import preprocessing
import numpy as np

# Training matrix: one sample per row, one feature per column.
x_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])

# Fit the scaler on the training data and rescale that data in one step.
scaler = preprocessing.MinMaxScaler()
x_train = scaler.fit_transform(x_train)

print("after standardiztion......")
print('x_train: {}'.format(x_train))

# Test data reuses the fitted scaler; it is not refitted on the new sample.
x_test = np.array([[ -3., -1.,  4.]])
print("apply same transformation to new data(test data)\n")

x_test = scaler.transform(x_test)
print('x_test: {}'.format(x_test))

after standardiztion......
x_train: [[0.5        0.         1.        ]
 [1.         0.5        0.33333333]
 [0.         1.         0.        ]]
apply same transformation to new data(test data)

x_test: [[-1.5         0.          1.66666667]]


# Evaluate Result

In [9]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix

# Ground-truth labels and the predictions that are scored against them.
y_test = [0, 1, 0, 1, 0]
y_pred = [1, 0, 0, 1, 0]

# Regression-style scores.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Classification scores: normalize=False gives the number of correct
# predictions, while the default call gives the fraction that are correct.
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
accuracy = accuracy_score(y_test, y_pred)
con_matrix = confusion_matrix(y_test, y_pred)

# Report every score.
print("Mean squared error: {}".format(mse))
print('r2 score: {}'.format(r2))
print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('confusion matrix: {}'.format(con_matrix))

Mean squared error: 0.4
r2 score: -0.6666666666666665
number of correct sample: 3
accuracy: 0.6
confusion matrix: [[2 1]
 [1 1]]


# Kaggle Introduction

Kaggle website: https://www.kaggle.com/  
Kaggle API: https://github.com/Kaggle/kaggle-api