In [None]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

# Reproducible synthetic data: y = 3*x + 2 plus standard-normal noise.
np.random.seed(0)
X = np.random.random(size=(20, 1))
y = 3 * X[:, 0] + 2 + np.random.normal(size=20)

model = LinearRegression(fit_intercept=True)
model.fit(X, y)
# coef_ is a 1-D array (one entry per feature). Index the single entry so the
# '%f' format receives a scalar; implicit conversion of a length-1 array to
# float is deprecated in recent NumPy releases.
print("Model coefficient: %.5f, and intercept: %.5f"
      % (model.coef_[0], model.intercept_))

# Plot the data and the model prediction
X_test = np.linspace(0, 1, 100)[:, np.newaxis]  # column vector: predict() expects 2-D input
y_test = model.predict(X_test)

plt.plot(X[:, 0], y, 'o')
plt.plot(X_test[:, 0], y_test)
plt.title('Linear regression with a single input variable');

### estimator

- **all Estimators**
  + `model.fit()` : training data 학습.  
       - supervised learning : `model.fit(X, y)`    
       - unsupervised learning : `model.fit(X)`  
               
               
- **supervised estimators**
  + `model.predict()` : 학습된 모델을 사용하여 새로운 데이터의 target 예측, `model.predict(X_new)`
  + `model.predict_proba()` : classification 에서 각 category로 분류될 확률 예측.
  + `model.score()` : 모델 성능 점수. classification 은 정확도(accuracy, 0~1), regression 은 결정계수 $R^2$ (최대 1, 모델이 나쁘면 음수도 가능)
  
  
  
- **unsupervised estimators**
  + `model.transform()` : 데이터 변형
  + `model.fit_transform()` : fit & transform

In [None]:
from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd

# Build the iris feature table with readable column names in a single step.
iris = datasets.load_iris()
x = pd.DataFrame(
    iris.data,
    columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'],
)

# KMeans initialisation is random: fix random_state so the cluster order (and
# therefore the columns of x_trans) is the same on every run. n_init is given
# explicitly because its default differs between scikit-learn versions.
model = KMeans(n_clusters=3, n_init=10, random_state=0)

# Transform X to a cluster-distance space.
x_trans = model.fit_transform(x)
# Show only the first rows instead of dumping the full array.
x_trans[:5]

### heterogeneous data type

In [None]:
import pandas as pd

# Load the Titanic training set (path is relative to the notebook's working directory).
data = pd.read_csv('data/titanic/train.csv')
# Preview the first rows (head() defaults to 5).
data.head()

In [None]:
# Inspect the dtype pandas assigned to each column of the loaded frame.
data.dtypes

In [None]:
# Target column for the supervised task.
y = data['Survived']
y.head()

In [None]:
# Select the three numeric columns as a new frame.
numerical_features = data.loc[:, ['Fare', 'Pclass', 'Age']]
numerical_features.head()

In [None]:
# Per-column medians used to fill missing values.
# median() already skips NaN column by column. Calling dropna() first would
# discard every row that has a NaN in *any* column, throwing away valid values
# in the other columns and biasing their medians.
median_features = numerical_features.median()
imputed_features = numerical_features.fillna(median_features)
imputed_features.head(10)

In [None]:
# One-hot encode 'Sex': one indicator column per category, prefixed with 'Sex'.
pd.get_dummies(data['Sex'], prefix='Sex').head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the 'Sex' labels as integer codes with scikit-learn's LabelEncoder.
# NOTE: the variable names keep their original spelling ("feagures") because
# later cells refer to them.
le = LabelEncoder()
categorical_feagures = data['Sex']

# fit() returns the encoder itself, so fit + transform can be chained explicitly.
new_categorical_feagures = le.fit(categorical_feagures).transform(categorical_feagures)
new_categorical_feagures

In [None]:
# Same integer encoding done with pandas: factorize returns (codes, uniques);
# only the codes array is kept.
new_categorical_feagures = pd.factorize(categorical_feagures)[0]
new_categorical_feagures

In [None]:
# Wrap the encoded 'Sex' codes in a frame that shares the index of the numeric
# features. pd.concat(axis=1) aligns rows by index label, so relying on a fresh
# default RangeIndex would silently misalign (and introduce NaN rows) whenever
# imputed_features does not carry a 0..n-1 index.
df = pd.DataFrame(
    data=new_categorical_feagures,
    columns=['Sex'],
    index=imputed_features.index,
)
new_features = pd.concat([imputed_features, df], axis=1)
new_features.head(5)