### Imports

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
import pandas as pd
from sklearn.cluster import KMeans 
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from ipywidgets import widgets
from IPython.display import display
from IPython.display import clear_output

### Debugging commands

In [2]:
# PixieDust is imported as a debugging aid; install it with `pip install pixiedust` if it is missing.
import pixiedust
# NOTE(review): presumably `%%pixie_debugger` goes on the first line of a cell to debug it -- confirm against the PixieDust docs.

# Load the autotime extension; the "time: ..." line under each cell output comes from it.
%load_ext autotime

# Print NumPy floats with two decimal places.
float_formatter = "{:.2f}".format
np.set_printoptions(formatter={'float_kind':float_formatter})

Pixiedust database opened successfully


### Data reading

#### SarparastChild

In [3]:
# Load the raw guardian/child table and derive the cleaned working frame `bs`.
sarparastChild = pd.read_csv('Data/sc.csv')
# sarparastChild = pd.read_csv('Data/clusteredSarparastChild.csv')

# Columns in which a missing value is treated as zero.
zero_fill_cols = [
    'behzistirialls',
    'child_hoghogh',
    'child_behzistirialls',
    'child_melkrialls',
    'child_jobprice',
    'child_savari_car',
    'child_general_car',
]

# Drop the CSV's saved index column and fill the zero-default columns in one
# call. The previous `bs[col].fillna(0, inplace=True)` form is chained
# assignment: under pandas Copy-on-Write it no longer updates `bs`.
bs = sarparastChild.drop('Unnamed: 0', axis=1)
bs = bs.fillna({col: 0 for col in zero_fill_cols})

# Keep only rows whose salary / property values are non-negative. Rows where
# 'hoghogh' or 'melkrialls' is missing are dropped here too (NaN >= 0 is False).
valid_rows = (
    (bs['child_hoghogh'] >= 0)
    & (bs['hoghogh'] >= 0)
    & (bs['melkrialls'] >= 0)
)

# Every remaining missing value is marked with the sentinel -1. fillna returns
# a new frame, so later column assignments do not write into a slice.
bs = bs[valid_rows].fillna(-1)

time: 1.8 s


### Preprocess

In [4]:
# Household-level totals: each new column adds the guardian's value to the
# child's value. Note that the -1 missing-value sentinel is summed as-is.
bs['Mahane'] = bs['hoghogh'] + bs['jobprice'] + bs['child_hoghogh'] + bs['child_jobprice']
paired_sums = [
    ('Behzisti', 'behzistirialls', 'child_behzistirialls'),
    ('Ziarati', 'ziarati_trip', 'child_ziarati_trip'),
    ('Other_trip', 'other_trip', 'child_other_trip'),
    ('Melk', 'melkrialls', 'child_melkrialls'),
    ('Savari', 'savari_car', 'child_savari_car'),
    ('General_Car', 'general_car', 'child_general_car'),
]
for total_col, guardian_col, child_col in paired_sums:
    bs[total_col] = bs[guardian_col] + bs[child_col]
bs['Yarane'] = bs['Yarane_92']

# Identifier/bookkeeping columns plus the raw columns that were just aggregated.
dropList = [
    'Gender', 'isdeparted', 'Personid', 'Parent_Personid', 'IsRemoved',
    'B97_Didari_FrstPrdBlnc', 'B97_Didari_AmDbtr',
    'B97_Didari_Bnft', 'B97_Didari_LstPrdBlnc',
    'B97_GheirDidari_FrstPrdBlnc', 'B97_GheirDidari_AmDbtr',
    'B97_GheirDidari_Bnft', 'B97_GheirDidari_LstPrdBlnc',
    'hoghogh', 'jobprice', 'child_hoghogh',
    'behzistirialls', 'child_behzistirialls',
    'ziarati_trip', 'child_ziarati_trip', 'other_trip', 'child_other_trip',
    'melkrialls', 'child_melkrialls', 'child_jobprice',
    'savari_car', 'child_savari_car',
    'general_car', 'child_general_car', 'Yarane_92',
]
bs = bs.drop(dropList, axis=1)

# Where the tenant flag is negative (unknown), infer it from property value:
# no property -> 1 (tenant), some property -> 0 (not a tenant).
tenancy_unknown = bs['ismostajer'] < 0
without_property = tenancy_unknown & (bs['Melk'] <= 0)
with_property = tenancy_unknown & (bs['Melk'] > 0)
bs.loc[without_property, 'ismostajer'] = 1
bs.loc[with_property, 'ismostajer'] = 0

# Discard rows with an unknown province or county, and keep the readable
# province/county names for the dropdowns before they are label-encoded.
bs = bs[bs['Ostan'] != -1]
bs = bs[bs['Shahrestan'] != -1]
OstSha = bs[['Ostan', 'Shahrestan']]

print('before:')
print (bs['Shahrestan'])

# Ostan: replace province names with integer codes.
Ostan = preprocessing.LabelEncoder()
bs['Ostan'] = Ostan.fit_transform(bs['Ostan'])

# Shahrestan: replace county names with integer codes.
Shahrestan = preprocessing.LabelEncoder()
bs['Shahrestan'] = Shahrestan.fit_transform(bs['Shahrestan'])

print('after:')
print (bs['Shahrestan'])

before:
5             ری
7         اصفهان
44          ساری
47          ساری
50           رشت
           ...  
499990       رشت
499991       رشت
499993       کرج
499994       رشت
499998    اردبیل
Name: Shahrestan, Length: 73262, dtype: object
after:
5         173
7          34
44        183
47        183
50        164
         ... 
499990    164
499991    164
499993    364
499994    164
499998     18
Name: Shahrestan, Length: 73262, dtype: int32
time: 130 ms


## App

In [5]:
def scoreFunc(inp):
    """Return the wealth score used for decile ranking.

    `inp` is anything indexable by column name (a DataFrame, or a mapping of
    scalars) that provides 'Mahane', 'familysize', 'ismostajer', 'Behzisti',
    'Melk' and 'Savari'.

    The score is 120 * Mahane divided by (familysize + ismostajer), plus
    Behzisti, plus Melk, plus five times Savari.
    """
    household_divisor = inp['familysize'] + inp['ismostajer']
    income_component = 10 * 12.0 * inp['Mahane'] / household_divisor
    car_component = inp['Savari'] * 5
    return income_component + inp['Behzisti'] + inp['Melk'] + car_component

def myDecile(me, originalData):
    """Return the 1-based decile of `me` among `originalData` plus `me`.

    Parameters
    ----------
    me : pandas.DataFrame
        The household to rank; only its first row is ranked. It must carry the
        columns that `scoreFunc` reads.
    originalData : pandas.DataFrame
        Reference population with the same score columns.

    Returns
    -------
    Integer in 1..10 (a NumPy integer), where 1 is the lowest-scoring decile.

    Raises
    ------
    ValueError
        Raised by `pd.qcut` when the decile bin edges are not unique.
    """
    # DataFrame.append was removed in pandas 2.0, so use concat. With
    # ignore_index=True the first row of `me` sits at position
    # len(originalData), which lets us read its decile by position instead of
    # matching on float equality and index label 0.
    combined = pd.concat([originalData, me], ignore_index=True)
    scores = scoreFunc(combined)
    deciles = pd.qcut(scores, 10, labels=False)
    return deciles.iloc[len(originalData)] + 1



def on_change(inpu):
    """Refresh the county dropdown after the province dropdown changes.

    `inpu` is the change notification passed by the widget's `observe`
    mechanism. When the handler is registered for all traits it is invoked for
    every trait change, so notifications whose 'name' is not 'value' are
    ignored and the county list is rebuilt once per selection.
    """
    if isinstance(inpu, dict) and inpu.get('name', 'value') != 'value':
        return
    newOstan = inputs['Ostan'].value
    counties = OstSha.loc[OstSha['Ostan'] == newOstan, 'Shahrestan']
    county_options = counties.drop_duplicates().tolist()
    if not county_options:
        # Nothing to offer for this province; leave the dropdown untouched.
        return
    inputs['Shahrestan'].options = county_options
    inputs['Shahrestan'].value = county_options[0]

# Build one input widget per feature the decile calculation needs.
inputs = {}

# Province and county come first; the remaining features keep the column order
# of `bs`. (Building the list from a set made the widget order vary by run.)
notIncludedlist = ['Ziarati', 'General_Car', 'Yarane', 'Other_trip']
location_cols = ['Ostan', 'Shahrestan']
featurelist = location_cols + [
    col for col in bs.columns
    if col not in notIncludedlist and col not in location_cols
]

aux = bs[featurelist]
for col in aux.columns:
    if col == 'Ostan':
        inputs[col] = widgets.Dropdown(
            options=Ostan.classes_.tolist(),
            value='تهران',
            description='Ostan',
        )
        # Only react to changes of the selected value, not to every trait.
        inputs[col].observe(on_change, names='value')
    elif col == 'Shahrestan':
        # Start with the counties of the default province.
        shah = OstSha[OstSha['Ostan'] == 'تهران']['Shahrestan']
        shah = shah.drop_duplicates()
        inputs[col] = widgets.Dropdown(
            options=shah.tolist(),
            value='تهران',
            description='Shahrestan',
        )
    elif col == 'ismostajer':
        # Yes/No dropdown; on_button_clicked maps it to 1/0.
        inputs[col] = widgets.Dropdown(
            options=['بله', 'خیر'],
            value='خیر',
            description='Mostajer',
        )
    else:
        inputs[col] = widgets.Text(description=col, value='1')
    display(inputs[col])

button = widgets.Button(description="نتیجه")
output = widgets.Output()

display(button, output)

def _percent_text(part, whole):
    """Format part/whole as a percentage string with two decimals ('0%' if whole is 0)."""
    if whole == 0:
        return '0%'
    return str(round((part / whole) * 100, 2)) + '%'


def _print_income_comparison(region, region_mean, own_income):
    """Print how `own_income` compares with the mean monthly income of `region`."""
    gap = own_income - region_mean
    if gap == 0:
        print(f'میزان درآمد خانوار شما برابر میانگین درآمد {region} است.')
        return
    direction = 'بیشتر است.' if gap > 0 else 'کمتر است.'
    print(f'متوسط درآمد ماهانه خانوارها در {region} شما برابر', f'{region_mean:,}',
          'ريال است و درآمد شما از این متوسط، به میزان', f'{abs(gap):,}', direction)


def on_button_clicked(b):
    """Compute and print the report for the values currently in the input widgets.

    Reads every widget in `inputs`, ranks the household with `myDecile` against
    `aux`, then prints province- and county-level statistics taken from `bs`.
    All printing happens inside the `output` widget, which is cleared first.
    `b` is the clicked button (unused).
    """
    with output:
        clear_output()
        test_input = {}
        for key, widget in inputs.items():
            val = widget.value
            if key == 'Ostan':
                val = Ostan.transform([val])[0]
            elif key == 'Shahrestan':
                val = Shahrestan.transform([val])[0]
            elif key == 'ismostajer':
                val = 1 if val == 'بله' else 0
            try:
                val = float(val)
            except ValueError:
                # Free-text fields must hold a number; report it, not a traceback.
                print('مقدار واردشده برای', key, 'باید یک عدد باشد.')
                return
            test_input[key] = [val]
        test_input = pd.DataFrame.from_dict(test_input)

        dec = myDecile(test_input, aux)
        print('شما در دهک', dec, 'هستید.')
        print()
        ostInp = int(test_input['Ostan'][0])
        shahInp = int(test_input['Shahrestan'][0])

        # County rows are taken inside the selected province, because the county
        # code is derived from the county name alone. Fall back to the name-only
        # match if the province/county pair has no rows.
        province_rows = bs[bs['Ostan'] == ostInp]
        county_rows = province_rows[province_rows['Shahrestan'] == shahInp]
        if county_rows.empty:
            county_rows = bs[bs['Shahrestan'] == shahInp]

        ziaratiTotal = bs['Ziarati'].sum()
        tripTotal = bs['Other_trip'].sum()

        ostanZiarati = province_rows['Ziarati'].sum()
        shahrZiarati = county_rows['Ziarati'].sum()
        print(_percent_text(ostanZiarati, ziaratiTotal), 'از سفرهای زیارتی کشور از استان شما انجام شده است.')
        print(_percent_text(shahrZiarati, ostanZiarati), 'از سفرهای زیارتی استان از شهرستان شما انجام شده است.')
        print()

        ostanTrip = province_rows['Other_trip'].sum()
        shahrTrip = county_rows['Other_trip'].sum()
        print(_percent_text(ostanTrip, tripTotal), 'از سفرهای غیرزیارتی کشور از استان شما انجام شده است.')
        # The message reports the county's share of the province, so divide by
        # the province total (as the pilgrimage ratio does), not the national one.
        print(_percent_text(shahrTrip, ostanTrip), 'از سفرهای غیرزیارتی استان از شهرستان شما انجام شده است.')
        print()

        bodeOstan = round(province_rows['familysize'].mean(), 1)
        bodeShahrestan = round(county_rows['familysize'].mean(), 1)
        print('در استان شما به طور میانگین بعد خانوارها برابر', bodeOstan, 'است.')
        print('در شهرستان شما به طور میانگین بعد خانوارها برابر', bodeShahrestan, 'است.')
        print()

        mostOstan = str(round(province_rows['ismostajer'].mean() * 100, 2)) + '%'
        mostShah = str(round(county_rows['ismostajer'].mean() * 100, 2)) + '%'
        print('درصد اجاره‌نشین‌ها در استان شما برابر ', mostOstan, 'است.')
        print('درصد اجاره‌نشین‌ها در شهرستان شما برابر ', mostShah, 'است.')
        print()

        daramadOstan = round(province_rows['Mahane'].mean())
        daramadShahr = round(county_rows['Mahane'].mean())
        mahan = int(test_input['Mahane'][0])
        _print_income_comparison('استان', daramadOstan, mahan)
        _print_income_comparison('شهرستان', daramadShahr, mahan)

# Register on_button_clicked as the click handler of the result button.
button.on_click(on_button_clicked)

Dropdown(description='Ostan', index=7, options=('آذربایجان شرقی', 'آذربایجان غربی', 'اردبیل', 'اصفهان', 'البرز…

Dropdown(description='Shahrestan', index=2, options=('ری', 'پیشوا', 'تهران', 'بهارستان', 'پردیس', 'دماوند', 'پ…

Text(value='1', description='Behzisti')

Text(value='1', description='familysize')

Text(value='1', description='Savari')

Text(value='1', description='Melk')

Text(value='1', description='Mahane')

Dropdown(description='Mostajer', index=1, options=('بله', 'خیر'), value='خیر')

Button(description='نتیجه', style=ButtonStyle())

Output()

time: 126 ms


### Data Preview

In [10]:
# Show the first rows of the preprocessed frame (province and county are label-encoded by now).
bs.head()

Unnamed: 0,familysize,Ostan,Shahrestan,ismostajer,Mahane,Behzisti,Ziarati,Other_trip,Melk,Savari,General_Car,Yarane
5,2,7,173,0.0,12977347.0,0.0,0,0,690000000.0,-1.0,-1.0,1
7,1,3,34,0.0,-1.0,0.0,0,5,1800000000.0,-1.0,-1.0,0
44,4,19,183,0.0,76371687.0,0.0,2,0,1900000000.0,1200000000.0,-1.0,1
47,4,19,183,0.0,80424806.0,0.0,0,0,1730000000.0,450000000.0,-1.0,1
50,4,29,164,1.0,16229832.0,0.0,0,0,3977700000.0,450000000.0,-1.0,1


time: 13 ms


## Classifiers & Clusterings

### Decision Tree & K-Fold cross validation

In [11]:
# Split the dataset into features and target variable. Feature order follows
# the column order of `bs` so that repeated runs see the same layout.
target_cols = ['Yarane']
feature_cols = [col for col in bs.columns if col not in target_cols]
X = bs[feature_cols] # Features
y = bs[target_cols] # Target variable

# 10-fold cross validation; per-fold metrics are collected in these lists.
kf = KFold(n_splits=10)
Accuracy = []
Specifity = []
Noname = []
Recall = []
Precision = []
maxAcc = 0
maxClf = None  # best-scoring fold's classifier; None until a fold beats maxAcc
for train_index, test_index in kf.split(bs):
    train = bs.iloc[train_index]
    test = bs.iloc[test_index]
    # Create the Decision Tree classifier (seeded so results are repeatable).
    clf = DecisionTreeClassifier(random_state=0)

    X_train = train[feature_cols]
    y_train = train[target_cols]
    X_test = test[feature_cols]
    y_test = test[target_cols]
    # Train on a 1-D target array.
    clf = clf.fit(X_train, y_train.values.ravel())

    # Predict the response for the test fold.
    y_pred = clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    # labels=[0, 1] forces a 2x2 matrix even when a fold holds a single class,
    # so the unpacking cannot fail and the guards below are reachable.
    # NOTE(review): assumes the target is coded 0/1 -- confirm.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    if (tn == 0 and fp == 0):
        spec = 1
    else:
        spec = tn / (tn + fp)
    rec = metrics.recall_score(y_test, y_pred)
    # "Noname" is tn / (tn + fn), i.e. the negative predictive value.
    if (tn == 0 and fn == 0):
        non = 1
    else:
        non = tn / (tn + fn)
    # How often was the prediction correct?
    print('Accuracy =',acc)
    print()
    # How many of the actual 0s were found?
    print('Specifity =',spec)
    # How many of the actual 1s were found?
    print('Recall =',rec)
    print()
    # How many of the predicted 0s were really 0?
    print('Noname =',non)
    # How many of the predicted 1s were really 1?
    print('Precision =',prec)
    print('-----------')
    print()

    # Remember the best fold's classifier and its training data.
    if (acc > maxAcc):
        maxClf = clf
        maxAcc = acc
        x_t = X_train
        y_t = y_train
    Accuracy.append(acc)
    Specifity.append(spec)
    Recall.append(rec)
    Noname.append(non)
    Precision.append(prec)

# Mean of each metric over the folds. np.mean replaces the hand-written loops
# that rebound the builtin name `sum`.
fold_metrics = [
    ('Accuracy', Accuracy),
    ('Specifity', Specifity),
    ('Recall', Recall),
    ('Noname', Noname),
    ('Precision', Precision),
]
for metric_name, fold_values in fold_metrics:
    avg = float(np.mean(fold_values))
    print()
    print('mean ' + metric_name + ' =', avg)

# Leave the best fold's classifier and training data in clf / X_train / y_train.
if maxClf is not None:
    clf = maxClf
    X_train = x_t
    y_train = y_t

Accuracy = 0.8203903371093216

Specifity = 0.49589858314690527
Recall = 0.8930838623454728

Noname = 0.5095785440613027
Precision = 0.8877449352374627
-----------

Accuracy = 0.7935034802784223

Specifity = 0.5686160972785177
Recall = 0.8628571428571429

Noname = 0.5611428571428572
Precision = 0.8664156356464049
-----------

Accuracy = 0.7856947856947857

Specifity = 0.594048884165781
Recall = 0.8519470977222631

Noname = 0.581081081081081
Precision = 0.8585708996667901
-----------

Accuracy = 0.7831012831012831

Specifity = 0.5628227194492255
Recall = 0.8518717535375246

Noname = 0.5425884955752213
Precision = 0.8619064878579196
-----------

Accuracy = 0.8075348075348076

Specifity = 0.5292047853624209
Recall = 0.8745131244707874

Noname = 0.5036838580040187
Precision = 0.8853077318703926
-----------

Accuracy = 0.8158613158613158

Specifity = 0.514525993883792
Recall = 0.8813559322033898

Noname = 0.4852198990627253
Precision = 0.8930796430375484
-----------

Accuracy = 0.80862680862

### KNN

In [None]:
# Split the dataset into features and target variable. Feature order follows
# the column order of `bs` so that repeated runs see the same layout.
# NOTE(review): features are used unscaled although the data preview shows very
# different magnitudes per column; distance-based KNN may be dominated by the
# largest ones -- confirm whether scaling is wanted.
target_cols = ['Yarane']
feature_cols = [col for col in bs.columns if col not in target_cols]
X = bs[feature_cols] # Features
y = bs[target_cols] # Target variable

# 10-fold cross validation; per-fold metrics are collected in these lists.
kf = KFold(n_splits=10)
Accuracy = []
Specifity = []
Noname = []
Recall = []
Precision = []
maxAcc = 0
maxClf = None  # reset so a classifier left over from an earlier cell is never reused
for train_index, test_index in kf.split(bs):
    train = bs.iloc[train_index]
    test = bs.iloc[test_index]
    # Create the k-nearest-neighbours classifier.
    clf = KNeighborsClassifier(n_neighbors=10)

    X_train = train[feature_cols]
    y_train = train[target_cols]
    X_test = test[feature_cols]
    y_test = test[target_cols]
    # Train on a 1-D target array; a column-vector target makes scikit-learn
    # emit a DataConversionWarning here.
    clf = clf.fit(X_train, y_train.values.ravel())

    # Predict the response for the test fold.
    y_pred = clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    # labels=[0, 1] forces a 2x2 matrix even when a fold holds a single class,
    # so the unpacking cannot fail and the guards below are reachable.
    # NOTE(review): assumes the target is coded 0/1 -- confirm.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    if (tn == 0 and fp == 0):
        spec = 1
    else:
        spec = tn / (tn + fp)
    rec = metrics.recall_score(y_test, y_pred)
    # "Noname" is tn / (tn + fn), i.e. the negative predictive value.
    if (tn == 0 and fn == 0):
        non = 1
    else:
        non = tn / (tn + fn)
    # How often was the prediction correct?
    print('Accuracy =',acc)
    print()
    # How many of the actual 0s were found?
    print('Specifity =',spec)
    # How many of the actual 1s were found?
    print('Recall =',rec)
    print()
    # How many of the predicted 0s were really 0?
    print('Noname =',non)
    # How many of the predicted 1s were really 1?
    print('Precision =',prec)
    print('-----------')
    print()

    # Remember the best fold's classifier and its training data.
    if (acc > maxAcc):
        maxClf = clf
        maxAcc = acc
        x_t = X_train
        y_t = y_train
    Accuracy.append(acc)
    Specifity.append(spec)
    Recall.append(rec)
    Noname.append(non)
    Precision.append(prec)

# Mean of each metric over the folds. np.mean replaces the hand-written loops
# that rebound the builtin name `sum`.
fold_metrics = [
    ('Accuracy', Accuracy),
    ('Specifity', Specifity),
    ('Recall', Recall),
    ('Noname', Noname),
    ('Precision', Precision),
]
for metric_name, fold_values in fold_metrics:
    avg = float(np.mean(fold_values))
    print()
    print('mean ' + metric_name + ' =', avg)

# Leave the best fold's classifier and training data in clf / X_train / y_train.
if maxClf is not None:
    clf = maxClf
    X_train = x_t
    y_train = y_t

### K-Means

In [None]:
# Cluster households on every column except the location codes. Column order
# follows `bs` so the centroid table has a stable layout between runs.
# NOTE(review): this rebinds `aux`, which the App's button callback also reads
# as its reference population -- confirm that reuse is intended.
notIncluded_cols = ['Ostan', 'Shahrestan', 'IsRemoved']
cols = [col for col in bs.columns if col not in notIncluded_cols]
aux = bs[cols].copy() # Features; a copy, so adding 'cluster' below never writes into a slice of bs

clusters = 10
# Seeded so cluster assignments are reproducible.
kmeans = KMeans(n_clusters=clusters, random_state=0)
kmeans = kmeans.fit(aux)

# One row per cluster centre, labelled with the feature names.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=aux.columns)

# Attach each household's cluster label.
aux['cluster'] = kmeans.labels_.tolist()