In [1]:
import torch

In [2]:
torch.__version__

'1.3.0'

In [3]:
torch.version.cuda

'10.1'

In [16]:
torch.cuda.is_available()
torch.cuda.get_device_name(),
torch.cuda.get_device_properties(torch.cuda.current_device())

_CudaDeviceProperties(name='GeForce GTX 1060', major=6, minor=1, total_memory=6144MB, multi_processor_count=10)

_CudaDeviceProperties(name='GeForce GTX 1060', major=6, minor=1, total_memory=6144MB, multi_processor_count=10)

'GeForce GTX 1060'

# KNN Demo

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

## Prepare Dataset

In [3]:
filename='Social_Network_Ads.csv'
rds = pd.read_csv(f'Data/{filename}')

In [19]:
X=rds[['Age', 'EstimatedSalary']].values
Y=rds['Purchased'].values

## Training

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [30]:
sc = StandardScaler()

In [31]:
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [43]:
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
classifier.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Testing

In [47]:
from sklearn.metrics import confusion_matrix

In [48]:
Y_pred = classifier.predict(X_test)

In [50]:
cm = confusion_matrix(Y_test, Y_pred)

In [51]:
cm

array([[75,  5],
       [ 5, 35]], dtype=int64)

## Viz

In [None]:
fig = go.Figure(data=go.Bar(y=[2, 3, 1]))

In [None]:
fig.write_html('IMG/plotly_helloworld.html', auto_open=True)

# LR

In [59]:
import pandas as pd
import numpy as np

## Prepare Dataset

In [60]:
filename='Salary_Data.csv'
rds = pd.read_csv(f'Data/{filename}')

In [64]:
import plotly.express as px
fig = px.scatter(rds,x="YearsExperience", y="Salary")
fig.show(renderer='iframe')

In [67]:
X=rds['YearsExperience'].values
Y=rds['Salary'].values

## Training

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [71]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [81]:
X_train.reshape(1,-1)

array([[ 9.5,  8.7,  1.1,  3.7,  3.9,  4. ,  7.9,  4.5,  1.5,  2.2, 10.5,
         4.1,  5.9,  2.9,  4.9,  7.1,  5.3,  8.2,  9. ,  1.3,  3. ]])

In [95]:
X_train.reshape(1,-1).shape, X_train.shape

((1, 21), (21,))

In [96]:
regressor = LinearRegression()
regressor.fit(X_train.reshape(-1,1), Y_train.reshape(-1,1))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Testing

In [117]:
Y_pred = regressor.predict(X_test.reshape(-1,1))
Y_pred

array([[ 63603.94911591],
       [123470.21625739],
       [ 74056.78941045],
       [ 82609.11328781],
       [ 44598.78494401],
       [ 56001.88344715],
       [ 90211.17895657],
       [116818.40879722],
       [ 56001.88344715]])

In [118]:
Y_pred.flatten()

array([ 63603.94911591, 123470.21625739,  74056.78941045,  82609.11328781,
        44598.78494401,  56001.88344715,  90211.17895657, 116818.40879722,
        56001.88344715])

## Viz

In [1]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [125]:
fig = go.Figure()
# fig.add_trace(go.Scatter(x=X, y=Y, mode='markers+lines', name='Raw'))
fig.add_trace(go.Scatter(x=X_train, y=Y_train, mode='markers', name='Train'))
fig.add_trace(go.Scatter(x=X_test, y=Y_test, mode='markers', name='Test'))
fig.add_trace(go.Scatter(x=X_test, y=Y_pred.flatten(), mode='markers+lines', name='Predict'))
fig.update_layout(title_text="Linear Regression Model",)
fig.update_xaxes(title_text='Experience By Year')
fig.update_yaxes(title_text='Salary')
fig.show(renderer='iframe')

# MLR

In [126]:
import pandas as pd
import numpy as np

## Prepare Dataset

In [176]:
filename='50_Startups.csv'
rds = pd.read_csv(f'Data/{filename}')

In [177]:
X=rds.iloc[:,:-1].values
Y=rds.iloc[:,-1].values.reshape(-1,1)

In [178]:
cities = rds.iloc[:,-2]
cities.unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [179]:
X.shape, Y.shape

((50, 4), (50, 1))

## Training

In [180]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import
from sklearn.linear_model import LinearRegression

In [181]:
labelencoder = LabelEncoder()
X[:,3] = labelencoder.fit_transform(X[:,3])

In [194]:
onehotencoder = OneHotEncoder(categorical_features=[3])
X0 = onehotencoder.fit_transform(X).toarray()


The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


The 'categorical_features' keyword is deprecated in version 0.20 and will be removed in 0.22. You can use the ColumnTransformer instead.



In [196]:
X_train, X_test, Y_train, Y_test = train_test_split(X0[:,1:], Y, test_size=0.2)

In [199]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((40, 5), (10, 5), (40, 1), (10, 1))

In [200]:
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Testing

In [202]:
Y_pred = regressor.predict(X_test)
Y_pred, Y_test

(array([[153655.01626395],
        [130540.83616682],
        [ 99235.91067317],
        [ 56716.44836416],
        [128051.69150323],
        [ 63731.3002788 ],
        [ 73635.03205154],
        [161962.80759298],
        [ 86013.15107891],
        [102456.79854904]]), array([[132602.65],
        [124266.9 ],
        [ 97427.84],
        [ 69758.98],
        [134307.35],
        [ 81229.06],
        [ 77798.83],
        [156991.12],
        [ 96479.51],
        [107404.34]]))

In [210]:
X

array([[165349.2, 136897.8, 471784.1, 2],
       [162597.7, 151377.59, 443898.53, 0],
       [153441.51, 101145.55, 407934.54, 1],
       [144372.41, 118671.85, 383199.62, 2],
       [142107.34, 91391.77, 366168.42, 1],
       [131876.9, 99814.71, 362861.36, 2],
       [134615.46, 147198.87, 127716.82, 0],
       [130298.13, 145530.06, 323876.68, 1],
       [120542.52, 148718.95, 311613.29, 2],
       [123334.88, 108679.17, 304981.62, 0],
       [101913.08, 110594.11, 229160.95, 1],
       [100671.96, 91790.61, 249744.55, 0],
       [93863.75, 127320.38, 249839.44, 1],
       [91992.39, 135495.07, 252664.93, 0],
       [119943.24, 156547.42, 256512.92, 1],
       [114523.61, 122616.84, 261776.23, 2],
       [78013.11, 121597.55, 264346.06, 0],
       [94657.16, 145077.58, 282574.31, 2],
       [91749.16, 114175.79, 294919.57, 1],
       [86419.7, 153514.11, 0.0, 2],
       [76253.86, 113867.3, 298664.47, 0],
       [78389.47, 153773.43, 299737.29, 2],
       [73994.56, 122782.75, 30331

## Viz

In [203]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [205]:
rds.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [284]:
[f"{i:<5.2f}" for i in Y_pred.flatten()]

['153655.02',
 '130540.84',
 '99235.91',
 '56716.45',
 '128051.69',
 '63731.30',
 '73635.03',
 '161962.81',
 '86013.15',
 '102456.80']

In [288]:
fig = go.Figure()
# fig.add_trace(go.Scatter(x=X, y=Y, mode='markers+lines', name='Raw'))
fig.add_trace(go.Scatter3d(x=X_train[:, -1], 
                           y=X_train[:, -2], 
                           z=X_train[:, -3],
                           hovertemplate="MS:%{x} <br>RD:%{y} <br>AD:%{z}",
                           marker=dict(size=12, 
                                       color = Y_pred.flatten(),                                    
                                       colorscale='Viridis',   # choose a colorscale
                                       showscale=True,
                                       opacity=0.8),                           
                           mode='markers',
                          name='Predict'))

fig.update_layout(title_text="Multi Linear Regression Model", 
                  scene={"aspectmode": "cube", 
                         "xaxis": {"title": "Marketing Spend"},
                       "yaxis": {"title": "R&D Spend" },
                       "zaxis": {"title": "Administration"}})

fig.show(renderer='iframe')
go.Scatter3d()

Scatter3d()

# Decision Tree

## Prepare Data

In [2]:
filename='adult_dataset.csv'
rds = pd.read_csv(f'Data/{filename}')

In [3]:
rds_cat = rds.select_dtypes(include=['object'])
rds_bool = rds_cat.apply(lambda elem: elem=='?', axis=0)
rds_bool['dropable'] = rds_bool.any(axis=1)

In [4]:
ds1 = rds.drop(rds_cat.columns, axis = 1)[~rds_bool['dropable']]
ds1_cat = rds_cat[~rds_bool['dropable']]

In [5]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
ds_cat = ds1_cat.apply(labelencoder.fit_transform)
ds = pd.concat([ds1, ds_cat], axis=1)
ds['income'] = ds['income'].astype('category')

In [6]:
X=ds.drop('income', axis=1)
Y=ds['income']

In [7]:
rds

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


## Training

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

In [10]:
dt_default = DecisionTreeClassifier(max_depth=5)
dt_default.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

## Testing

In [11]:
Y_pred_default = dt_default.predict(X_test)

In [12]:
print(classification_report(Y_test, Y_pred_default))

              precision    recall  f1-score   support

           0       0.86      0.95      0.91      6810
           1       0.79      0.54      0.64      2239

    accuracy                           0.85      9049
   macro avg       0.83      0.75      0.77      9049
weighted avg       0.85      0.85      0.84      9049



In [13]:
print(confusion_matrix(Y_test, Y_pred_default))

[[6494  316]
 [1034 1205]]


In [14]:
print(accuracy_score(Y_test, Y_pred_default))

0.8508122444469002


In [15]:
Y.name

'income'

## Viz

In [16]:
from sklearn.tree import export_graphviz
from IPython.core.display import display, HTML
from dtreeviz.trees import *

In [17]:
viz = dtreeviz(dt_default, X_train=X_train, 
               y_train=Y_train,
               feature_names=X.columns.to_list(), 
               target_name='income',
               class_names =['<=50K', '>50K']   ,
               fancy=False)

In [19]:
viz.view()  

# End