In [58]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import linear_model
from sklearn.datasets import load_boston
from sklearn.datasets import make_classification
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer


%matplotlib inline
%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=5, suppress=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
outlook = ['sunny', 'overcat', 'rainy']
temperature = ['hot', 'mild', 'cool']
humidity = ['high', 'normal']
windy = ['TRUE', 'FALSE']

weather_dataset = list()

for o in outlook:
    for t in temperature:
        for h in humidity:
            for w in windy:
                weather_dataset.append([o, t, h, w])

play = [0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 
        1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 
        0, 0, 0, 1]

In [16]:
len(play)

36

In [17]:
df = pd.DataFrame(weather_dataset, columns=['outlook', 'temperature', 'humidity', 'windy'])

In [18]:
pd.get_dummies(df.humidity).iloc[:5, :]

Unnamed: 0,high,normal
0,1,0
1,1,0
2,0,1
3,0,1
4,1,0


In [19]:
dummy_encoding = pd.get_dummies(df)
dummy_encoding

Unnamed: 0,outlook_overcat,outlook_rainy,outlook_sunny,temperature_cool,temperature_hot,temperature_mild,humidity_high,humidity_normal,windy_FALSE,windy_TRUE
0,0,0,1,0,1,0,1,0,0,1
1,0,0,1,0,1,0,1,0,1,0
2,0,0,1,0,1,0,0,1,0,1
3,0,0,1,0,1,0,0,1,1,0
4,0,0,1,0,0,1,1,0,0,1
5,0,0,1,0,0,1,1,0,1,0
6,0,0,1,0,0,1,0,1,0,1
7,0,0,1,0,0,1,0,1,1,0
8,0,0,1,1,0,0,1,0,0,1
9,0,0,1,1,0,0,1,0,1,0


In [21]:
X = sm.add_constant(dummy_encoding)
logit = sm.Logit(play, X)
result = logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.292346
         Iterations 17


  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,y,No. Observations:,36.0
Model:,Logit,Df Residuals:,29.0
Method:,MLE,Df Model:,6.0
Date:,"Sat, 27 Jun 2020",Pseudo R-squ.:,0.5744
Time:,06:25:05,Log-Likelihood:,-10.524
converged:,True,LL-Null:,-24.731
Covariance Type:,nonrobust,LLR p-value:,7.856e-05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2393,,,,,
outlook_overcat,2.9833,5.85e+07,5.1e-08,1.000,-1.15e+08,1.15e+08
outlook_rainy,-2.1746,6.15e+07,-3.54e-08,1.000,-1.2e+08,1.2e+08
outlook_sunny,-0.5695,6.17e+07,-9.23e-09,1.000,-1.21e+08,1.21e+08
temperature_cool,-2.1996,5.38e+07,-4.09e-08,1.000,-1.05e+08,1.05e+08
temperature_hot,0.3045,5.38e+07,5.66e-09,1.000,-1.05e+08,1.05e+08
temperature_mild,2.1344,5.38e+07,3.97e-08,1.000,-1.05e+08,1.05e+08
humidity_high,-2.0459,,,,,
humidity_normal,2.2851,,,,,


In [23]:
X.drop(['outlook_sunny', 'temperature_mild', 'humidity_normal', 'windy_FALSE'], inplace=True, axis=1)

In [24]:
logit = sm.Logit(play, X)
result = logit.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.292346
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                   36
Model:                          Logit   Df Residuals:                       29
Method:                           MLE   Df Model:                            6
Date:                Sat, 27 Jun 2020   Pseudo R-squ.:                  0.5744
Time:                        06:28:25   Log-Likelihood:                -10.524
converged:                       True   LL-Null:                       -24.731
Covariance Type:            nonrobust   LLR p-value:                 7.856e-05
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                5.4055      2.196      2.462      0.014       1.102       9.709
outlook_ove

### DictVectorizer 와 one-hot encoding

In [32]:
vectorizer = DictVectorizer(sparse = False)
dict_representation = [{varname:var for var, varname in zip(
    row,
    ['outlook', 'temperature', 'humidity', 'windy']
)} for row in weather_dataset]
print(dict_representation[0])
print(vectorizer.fit_transform(dict_representation))

{'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': 'TRUE'}
[[1. 0. 0. 0. 1. 0. 1. 0. 0. 1.]
 [1. 0. 0. 0. 1. 0. 1. 0. 1. 0.]
 [0. 1. 0. 0. 1. 0. 1. 0. 0. 1.]
 [0. 1. 0. 0. 1. 0. 1. 0. 1. 0.]
 [1. 0. 0. 0. 1. 0. 0. 1. 0. 1.]
 [1. 0. 0. 0. 1. 0. 0. 1. 1. 0.]
 [0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
 [0. 1. 0. 0. 1. 0. 0. 1. 1. 0.]
 [1. 0. 0. 0. 1. 1. 0. 0. 0. 1.]
 [1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
 [0. 1. 0. 0. 1. 1. 0. 0. 0. 1.]
 [0. 1. 0. 0. 1. 1. 0. 0. 1. 0.]
 [1. 0. 1. 0. 0. 0. 1. 0. 0. 1.]
 [1. 0. 1. 0. 0. 0. 1. 0. 1. 0.]
 [0. 1. 1. 0. 0. 0. 1. 0. 0. 1.]
 [0. 1. 1. 0. 0. 0. 1. 0. 1. 0.]
 [1. 0. 1. 0. 0. 0. 0. 1. 0. 1.]
 [1. 0. 1. 0. 0. 0. 0. 1. 1. 0.]
 [0. 1. 1. 0. 0. 0. 0. 1. 0. 1.]
 [0. 1. 1. 0. 0. 0. 0. 1. 1. 0.]
 [1. 0. 1. 0. 0. 1. 0. 0. 0. 1.]
 [1. 0. 1. 0. 0. 1. 0. 0. 1. 0.]
 [0. 1. 1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 1. 0. 0. 1. 0. 0. 1. 0.]
 [1. 0. 0. 1. 0. 0. 1. 0. 0. 1.]
 [1. 0. 0. 1. 0. 0. 1. 0. 1. 0.]
 [0. 1. 0. 1. 0. 0. 1. 0. 0. 1.]
 [0. 1. 0. 1. 0. 0. 1. 0. 1. 

In [33]:
print(vectorizer.feature_names_)

['humidity=high', 'humidity=normal', 'outlook=overcat', 'outlook=rainy', 'outlook=sunny', 'temperature=cool', 'temperature=hot', 'temperature=mild', 'windy=FALSE', 'windy=TRUE']


In [37]:
label_encoder = LabelEncoder()
label_encoder.fit_transform(df.outlook)

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [39]:
label_encoder.inverse_transform([0, 1, 2])

array(['overcat', 'rainy', 'sunny'], dtype=object)

In [40]:
label_encoder.classes_

array(['overcat', 'rainy', 'sunny'], dtype=object)

In [42]:
label_binarizer = LabelBinarizer()
label_binarizer.fit_transform(label_encoder.fit_transform(df.outlook))

array([[0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0]])

### 특성 해셔

In [44]:
your_text = 'Nomina sunt consequentia rerum'
mapping_words_in_text = {word:position for position, word in enumerate(set(your_text.lower().split(' ')))}
mapping_words_in_text

{'rerum': 0, 'consequentia': 1, 'nomina': 2, 'sunt': 3}

In [45]:
corpus = [
    'The quick for jumped over the lazy dog',
    'I sought a dog wondering around with a bird',
    'My dog is named Fido'
]

In [50]:
textual_one_hot_encoder = CountVectorizer(binary=True)

In [51]:
textual_one_hot_encoder.fit(corpus)

CountVectorizer(binary=True)

In [52]:
vectorized_text = textual_one_hot_encoder.transform(corpus)

In [54]:
vectorized_text.todense()

matrix([[0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1],
        [0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]])

In [55]:
textual_one_hot_encoder.get_feature_names()

['around',
 'bird',
 'dog',
 'fido',
 'for',
 'is',
 'jumped',
 'lazy',
 'my',
 'named',
 'over',
 'quick',
 'sought',
 'the',
 'with',
 'wondering']

In [57]:
textual_one_hot_encoder.transform(['Jone went home today']).todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [60]:
hashing_trick = HashingVectorizer(n_features=11, binary=True, norm=None)
M = hashing_trick.transform(corpus)
M.todense()

matrix([[1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0.]])

In [62]:
hashing_trick.transform(['John is the owner of that dog']).todense()

matrix([[1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0.]])