### **Use the F1 score to evaluate a classifier for myopia detection**

**1. Load libraries**

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

**2. Load data**

In [0]:
# Read the myopia study table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='MYOPIA.csv')

In [18]:
df.shape

(618, 18)

In [19]:
df.head()

Unnamed: 0,ID,STUDYYEAR,MYOPIC,AGE,GENDER,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,1,1992,1,6,1,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,2,1995,0,6,1,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,3,1991,0,6,1,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,4,1990,1,6,1,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,5,1995,0,5,0,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


**3. Preprocess the data and prepare the training and test sets**

In [0]:
# ID is only a row identifier, so it is removed before modelling.
df.drop(columns=['ID'], inplace=True)

In [0]:
# please drop AGE and GENDER together?
# df.drop(['AGE','GENDER'],axis=1,inplace=True)

In [22]:
df.head()

Unnamed: 0,STUDYYEAR,MYOPIC,AGE,GENDER,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,1992,1,6,1,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,1995,0,6,1,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,1991,0,6,1,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,1990,1,6,1,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,1995,0,5,0,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


In [0]:
# Target vector: the MYOPIC column of the study table.
y = df.loc[:, 'MYOPIC']

In [0]:
# Remove the target column so that only predictor columns remain in df.
df.drop(columns=['MYOPIC'], inplace=True)

In [25]:
df.head()

Unnamed: 0,STUDYYEAR,AGE,GENDER,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,1992,6,1,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,1995,6,1,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,1991,6,1,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,1990,6,1,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,1995,5,0,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


In [0]:
# Feature matrix. NOTE: this is plain assignment, so X is another name for the
# same DataFrame object as df (no copy) -- in-place changes to df also show up in X.
X = df

Use `KFold` to split the data set into folds for cross-validation.

In [0]:
from sklearn.model_selection import KFold

In [0]:
# Five shuffled folds; the fixed random_state keeps the fold assignment repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=886,
)

In [0]:
# x = np.array([0.2,0.7,12,98,4])
# for i,(idx1,idx2) in enumerate(kf.split(x)):
#   print('we are doing {}th cross'.format(i+1))
#   print(idx1,idx2)
#   print(x[idx1],x[idx2])
#   print('------------------')

In [40]:
X.head()

Unnamed: 0,STUDYYEAR,AGE,GENDER,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY
0,1992,6,1,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1
1,1995,6,1,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1
2,1991,6,1,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0
3,1990,6,1,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1
4,1995,5,0,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0


In [44]:
# K-fold cross-validation of a logistic-regression baseline, scored with F1.
# kf.split() yields *positional* row numbers, so rows are selected with .iloc.
# Label-based selection (X.loc[...] / y[...]) is only correct while the frame
# still carries the default 0..n-1 index; after any filtering or re-indexing it
# would select the wrong rows or raise a KeyError.
for train_idx, valid_idx in kf.split(X):
  train_X = X.iloc[train_idx]
  train_y = y.iloc[train_idx]

  valid_X = X.iloc[valid_idx]
  valid_y = y.iloc[valid_idx]

  model = LogisticRegression()
  model.fit(train_X,train_y)
  # predict() already returns hard class labels, so no 0.5 threshold is needed.
  pred_y = model.predict(valid_X)
  print('current f1 score is:{}'.format(f1_score(valid_y,pred_y)))

current f1 score is:0.41379310344827586
current f1 score is:0.3157894736842105
current f1 score is:0.3703703703703704
current f1 score is:0.46153846153846156
current f1 score is:0.13333333333333333


**Naive Bayes for spam mail detection**

In [0]:
import string

In [67]:
# Read the SMS spam table; the encoding is passed explicitly as latin-1.
spam = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)
print(spam.shape)

(5572, 5)


In [68]:
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [0]:
# Keep only the label (v1) and message text (v2); the three unnamed columns are discarded.
spam.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [50]:
spam.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Preprocessing your text**

In [0]:
def process_text(text, stop_words=None):
    """Strip punctuation from ``text`` and return its non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (the ``stopwords`` corpus must have been downloaded).

    Returns
    -------
    list of str
        The remaining tokens in their original order and original casing.
        Stop-word matching is case-insensitive.
    """
    if stop_words is None:
        # Build the stop-word set once per call instead of once per word;
        # set membership is also O(1) versus a linear scan of the list.
        stop_words = frozenset(stopwords.words('english'))

    nopunc = ''.join(char for char in text if char not in string.punctuation)

    # split() with no argument splits on any whitespace run, so removed
    # punctuation such as " - " no longer leaves empty-string tokens behind,
    # and newlines/tabs separate words too.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [0]:
import nltk
# Fetch the NLTK 'stopwords' corpus, which process_text reads through
# stopwords.words('english'); this cell must run before process_text is called.
nltk.download('stopwords')

In [0]:
# Replace each raw message in v2 with its list of cleaned tokens.
spam['v2'] = spam['v2'].apply(process_text)

In [71]:
spam.head()

Unnamed: 0,v1,v2
0,ham,"[Go, jurong, point, crazy, Available, bugis, n..."
1,ham,"[Ok, lar, Joking, wif, u, oni]"
2,spam,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin..."
3,ham,"[U, dun, say, early, hor, U, c, already, say]"
4,ham,"[Nah, dont, think, goes, usf, lives, around, t..."


In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
spam['v2'] = CountVectorizer(analyzer=process_text).fit_transform(spam['v2'])

In [0]:
vec_tokens = CountVectorizer(min_df=2,max_features=20000,analyzer=process_text).fit_transform(spam['v2'])

In [73]:
vec_tokens

<5572x314 sparse matrix of type '<class 'numpy.int64'>'
	with 772 stored elements in Compressed Sparse Row format>