In [1]:
import os

import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt


In [2]:
# Regression data: load_diabetes returns an (X, y) pair, which is
# star-unpacked straight into the 70/30 train/test split.
diabetes_data = datasets.load_diabetes(return_X_y=True, as_frame=True)
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
    *diabetes_data, test_size=0.3, random_state=42
)

# Classification data: same (X, y) layout and the same split settings.
bc_data = datasets.load_breast_cancer(return_X_y=True, as_frame=True)
bc_X_train, bc_X_test, bc_y_train, bc_y_test = train_test_split(
    *bc_data, test_size=0.3, random_state=42
)


In [4]:
# Preview the regression training features; the notebook renders a
# truncated view of the frame (first and last rows).
diabetes_X_train

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
225,0.030811,0.050680,0.032595,0.049415,-0.040096,-0.043589,-0.069172,0.034309,0.063015,0.003064
412,0.074401,-0.044642,0.085408,0.063187,0.014942,0.013091,0.015505,-0.002592,0.006207,0.085907
118,-0.056370,0.050680,-0.010517,0.025315,0.023198,0.040022,-0.039719,0.034309,0.020609,0.056912
114,0.023546,-0.044642,0.110198,0.063187,0.013567,-0.032942,-0.024993,0.020655,0.099241,0.023775
364,0.001751,0.050680,-0.006206,-0.019442,-0.009825,0.004949,-0.039719,0.034309,0.014821,0.098333
...,...,...,...,...,...,...,...,...,...,...
106,-0.096328,-0.044642,-0.076264,-0.043542,-0.045599,-0.034821,0.008142,-0.039493,-0.059471,-0.083920
270,0.005383,0.050680,0.030440,0.083844,-0.037344,-0.047347,0.015505,-0.039493,0.008641,0.015491
348,0.030811,-0.044642,-0.020218,-0.005670,-0.004321,-0.029497,0.078093,-0.039493,-0.010903,-0.001078
435,-0.012780,-0.044642,-0.023451,-0.040099,-0.016704,0.004636,-0.017629,-0.002592,-0.038460,-0.038357


## Regresja liniowa

Dokumentacja:

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

![image info](./image/linear.png)


In [5]:
# Create the estimator and fit it on the training split in one step
# (fit returns the fitted estimator itself).
regr = LinearRegression().fit(diabetes_X_train, diabetes_y_train)

# Predict on the held-out split and report the mean squared error.
regr_preds = regr.predict(diabetes_X_test)
regr_mse = mean_squared_error(diabetes_y_test, regr_preds)
print('MSE: ', regr_mse)


MSE:  2821.7509810013094


In [6]:
# Inspect the raw predictions made by the linear model on the test split.
regr_preds

array([138.46970031, 181.10052342, 125.34400904, 292.75977277,
       123.8830531 ,  91.89830434, 257.26463123, 177.76169318,
        84.98549706, 109.15960992,  94.4855284 , 166.13235108,
        57.40742502, 206.13897354,  97.7811842 , 131.00472765,
       214.29789972, 252.52907661, 199.66656381, 219.49985634,
       203.23671317,  88.00656925,  70.65108459, 186.92233427,
       155.81266751, 162.81022205, 191.93135706, 184.72924276,
        46.62920829, 108.26566599, 178.14743952,  91.35065005,
       127.72125745, 184.04205666, 172.23799897, 189.51548863,
       121.83265708, 117.75339206, 145.67950306,  58.57563401,
        72.55421321, 107.27571105, 166.15280313, 153.29339984,
       165.19282154,  62.95752038,  73.58909449, 110.05656189,
        54.49723354, 164.88920632, 152.49472803,  63.8874565 ,
       111.4354561 , 108.34936269, 179.96973311, 158.70054112,
        95.04833555, 209.68990406, 118.37356519,  69.69946093,
       189.04680627, 204.99138626, 140.26840176, 105.75

## Regresja logistyczna

Dokumentacja:

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

![image info](./image/logistic.png)


In [7]:
# Create the classifier with an explicit iteration cap of 10000 for the
# solver, then fit it on the training split.
lregr = LogisticRegression(max_iter=10000).fit(bc_X_train, bc_y_train)

# Predict on the held-out split and report the share of correct labels.
lregr_preds = lregr.predict(bc_X_test)
lregr_accuracy = accuracy_score(bc_y_test, lregr_preds)
print('Accuracy: ', lregr_accuracy)


Accuracy:  0.9766081871345029


## SVM

Dokumentacja:

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

![image info](./image/svm.png)



In [8]:
# Create a support vector classifier with default settings and fit it on
# the training split.
svc = SVC().fit(bc_X_train, bc_y_train)

# Predict on the held-out split and report the share of correct labels.
svc_preds = svc.predict(bc_X_test)
svc_accuracy = accuracy_score(bc_y_test, svc_preds)
print('Accuracy: ', svc_accuracy)


Accuracy:  0.935672514619883


## Drzewo decyzyjne

Dokumentacja:

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

![image info](./image/dt.png)



In [9]:
# Create the model. The seed is fixed (same value as the train/test splits)
# so the fitted tree, and therefore the reported accuracy, is identical on
# every Restart & Run All.
clf = DecisionTreeClassifier(random_state=42)

# Train the model on the training split.
clf.fit(bc_X_train, bc_y_train)

# Predict on the held-out split and report the share of correct labels.
clf_preds = clf.predict(bc_X_test)
print('Accuracy: ', accuracy_score(bc_y_test, clf_preds))


Accuracy:  0.9415204678362573


## MiNI Hackathon

Zadanie polega na przewidywaniu, czy dana osoba przetrwa katastrofę Titanica.

Zalecane kroki do wypróbowania:

- obróbka danych (uzupełnienie braków, usuwanie skrajnych wartości, niepotrzebnych kolumn?)
- skalowanie danych?
- zastosowanie modeli z warsztatu, dobór hiperparametrów

Opis danych:

![image info](./image/columns.png)


In [114]:
# The CSVs are read through paths relative to the notebook folder. When the
# kernel was started elsewhere (e.g. the repository root), switch into the
# notebook folder first. The guard checks for the data file itself rather
# than a substring of the cwd, so it is idempotent on re-runs, independent
# of the platform path separator, and never calls chdir on a missing folder.
NOTEBOOK_DIR = os.path.join('02_modele', 'Python')
if not os.path.isfile(os.path.join('data', 'train.csv')) and os.path.isdir(NOTEBOOK_DIR):
    os.chdir(NOTEBOOK_DIR)

train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

In [93]:
# Preview the Titanic training frame.
# NOTE(review): the stored output below already contains the IsWoman/IsMan
# columns and 'Deck' cabin placeholders that are only created by later cells,
# so it was produced by an out-of-order run; re-run top to bottom to refresh.
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsWoman,IsMan
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,Deck,S,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,True,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,Deck,S,True,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,True,False
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,Deck,S,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786,787,1,3,"Sjoblom, Miss. Anna Sofia",female,18.0,0,0,3101265,7.4958,Deck,S,True,False
787,788,0,3,"Rice, Master. George Hugh",male,8.0,4,1,382652,29.1250,Deck,Q,False,True
788,789,1,3,"Dean, Master. Bertram Vere",male,1.0,1,2,C.A. 2315,20.5750,Deck,S,False,True
789,790,0,1,"Guggenheim, Mr. Benjamin",male,46.0,0,0,PC 17593,79.2000,B82 B84,C,False,True


In [84]:
# Replace missing ages with 0 in both frames (0 acts as the "unknown" marker).
for frame in (train_df, test_df):
    frame['Age'] = frame['Age'].fillna(0)

In [85]:
# Replace missing cabin entries with the placeholder 'Deck' in both frames.
for frame in (train_df, test_df):
    frame['Cabin'] = frame['Cabin'].fillna('Deck')

In [86]:
# Replace missing embarkation ports with 'S' in both frames.
# NOTE(review): 'S' is presumably the most frequent port; confirm on the data.
for frame in (train_df, test_df):
    frame['Embarked'] = frame['Embarked'].fillna('S')

In [110]:
# Placeholder that the imputation cell writes into missing Cabin values.
# The cabin flags must compare against this exact value; comparing against
# any other string (previously 'No cabin') makes both flags constant.
MISSING_CABIN = 'Deck'


def add_titanic_features(df):
    """Return ``df`` with the engineered Titanic feature columns added.

    Added columns:
      - IsWoman / IsMan: boolean indicators derived from ``Sex``.
      - IsCabin / IsNotCabin: whether a real cabin is recorded, i.e. ``Cabin``
        is neither NaN nor the ``MISSING_CABIN`` placeholder.
      - NameLen: number of characters in ``Name``.
      - EmbarkS / EmbarkQ / EmbarkC: boolean indicators for each ``Embarked`` port.

    The input frame is not modified; a new frame is returned. Calling the
    function again on its own output yields the same columns, so the cell is
    safe to re-run.
    """
    # notna() keeps the flag correct even if the imputation cell was skipped.
    cabin_known = df['Cabin'].notna() & (df['Cabin'] != MISSING_CABIN)
    return df.assign(
        IsWoman=df['Sex'] == 'female',
        IsMan=df['Sex'] == 'male',
        IsCabin=cabin_known,
        IsNotCabin=~cabin_known,
        NameLen=df['Name'].str.len(),
        EmbarkS=df['Embarked'] == 'S',
        EmbarkQ=df['Embarked'] == 'Q',
        EmbarkC=df['Embarked'] == 'C',
    )


# Apply the identical transformation to both frames so they cannot drift apart.
train_df = add_titanic_features(train_df)
test_df = add_titanic_features(test_df)

# Fixed seed so the fitted tree and the reported accuracy are reproducible
# across runs (same value the notebook uses for its train/test splits).
clf = DecisionTreeClassifier(random_state=42)

# Feature columns fed to the model: the raw numeric fields plus the
# engineered sex and embarkation indicators.
columns = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'IsWoman',
    'IsMan',
    'EmbarkS',
    'EmbarkQ',
    'EmbarkC',
]

# Train on the training frame.
clf.fit(train_df[columns], train_df['Survived'])

# Predict on the test frame and compare with its Survived labels.
clf_preds = clf.predict(test_df[columns])
print('Accuracy: ', accuracy_score(test_df['Survived'], clf_preds))

Accuracy:  0.82
