In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC,SVC
from sklearn.metrics import classification_report,make_scorer,f1_score
from itertools import product
from sklearn.model_selection import KFold,StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

## 細節 :

* 調參過程中 tfidf fit時 用全部可用train fit 而不是用 skfold過的 部分train label True 當作 fit


In [51]:
def df_(df):
    df  = df [pd.notnull(df ['class'])]
    col = ['tweet', 'class']
    df = df[col]
    df.columns = ['tweet', 'label']
    label = df.label
    return df , label

In [52]:
def ML_method_one(tfidf_parameters,ML_model,ML_model_parameters,cv_times,train,train_label):

    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=15, norm='l2', encoding='utf-8', ngram_range=(1, 3), stop_words='english',max_features =None)
    kf = StratifiedKFold( n_splits = cv_times )
    kf.get_n_splits(train,train_label)
    
    tunelist_tfidf = [ dict( zip( tfidf_parameters , v ) ) for v in product ( *tfidf_parameters.values() ) ]
    tunelist_ML_model = [ dict( zip( ML_model_parameters , v ) ) for v in product ( *ML_model_parameters.values() ) ]
    
    tfidf_parameter_best_scores = []
    ML_model_best_scores_parameter = []
    
    for tfidf_parameter in tunelist_tfidf :

        tfidf.set_params( **tfidf_parameter )
        ML_model_parameter_scores = []

        for ML_model_parameter in tunelist_ML_model :

            ML_model.set_params( **ML_model_parameter )
            cv_scores_just_care_label_1_F1 = []
            
            num = 1
            
            print( 'tfidf para : ' , tfidf_parameter )
            print( 'Model para : ' , ML_model_parameter)
            
            for train_index , test_index in kf.split(train,train_label):

                cv_train , cv_test = train.loc[train_index], train.loc[test_index]
                cv_train_label , cv_test_label = train_label[train_index], train_label[test_index]
                
                tfidf.fit(train.loc[np.where(train_label.values==1)[0]].tweet.apply(lambda x: np.str_(x)) ) 

                cv_train = tfidf.transform( cv_train.tweet.apply( lambda x: np.str_(x) ) ) 
                cv_test = tfidf.transform( cv_test.tweet.apply( lambda x: np.str_(x) ) )

                ML_model.fit(cv_train,cv_train_label,)
                cv_pred = ML_model.predict(cv_test)

                cv_scores_just_care_label_1_F1.append( float(classification_report(cv_test_label,cv_pred,digits=5).split()[12]) )
                now_score = float(classification_report(cv_test_label,cv_pred,digits=5).split()[12])
                print('\t第'+str(num)+'次kfold，label 1 - F1 score : ' + str(now_score))

                num+=1
                
            ML_model_parameter_scores.append(np.mean(cv_scores_just_care_label_1_F1))
            print('\t此參數kfold平均為'+str(np.mean(cv_scores_just_care_label_1_F1)))
        tfidf_parameter_best_scores.append( np.max( np.array(ML_model_parameter_scores) ) )
        ML_model_best_scores_parameter.append(tunelist_ML_model[ np.argmax(np.array(ML_model_parameter_scores))] )
    
    print('調參完成 最佳參數為 : ' , tunelist_tfidf[np.argmax(np.array(tfidf_parameter_best_scores))] ,  ML_model_best_scores_parameter[np.argmax(np.array(tfidf_parameter_best_scores))] )
    return tunelist_tfidf[np.argmax(np.array(tfidf_parameter_best_scores))] , ML_model_best_scores_parameter[np.argmax(np.array(tfidf_parameter_best_scores))]

In [53]:
path = 'C:/Users/jeffh/Desktop/Sdata/'

# 第一種 單個DT 單純2020資料

## TF-IDF

fit : 

* 2020 train label - True

transform : 

* 2020 train label - True & False

### Train data

In [54]:
trainpath=path+'task1_training.xlsx'

In [55]:
train_ = pd.read_excel(trainpath)

In [56]:
train , train_label = df_(train_)

### Test data

In [57]:
testpath = path+'task1_validation.xlsx'

In [58]:
test_ = pd.read_excel(testpath)

In [59]:
test , test_label = df_(test_)

In [60]:
print('train - label 1   : ' , len(train[train.label==1]))
print('train - label 0   : ' , len(train[train.label==0]))
print('dev   - label 1   : ' , len(test[test.label==1]))
print('dev   - label 0   : ' , len(test[test.label==0]))

train - label 1   :  146
train - label 0   :  55273
dev   - label 1   :  35
dev   - label 0   :  13818


### train & finetune Model

In [91]:
tfidf_parameters = { 
'min_df': [2,3],
'max_df' : [5,10,15,20],
'ngram_range' : [(1,2),(1,3),(1,4)]
}

ML_model_parameters = { 
'min_samples_split': [0.1,0.01,0.001,0.0001],
'min_samples_leaf' : [1,2], 
}

ML_model = DecisionTreeClassifier(criterion='gini',max_depth=None,min_samples_split = 0.1,max_features=None,random_state=7)
#ML_model = DecisionTreeClassifier(criterion='gini',max_depth=None,min_samples_split=0.1,max_features=None,random_state=7)

In [92]:
tfidf_want_parameters , model_want_parameters = ML_method_one( tfidf_parameters , ML_model , ML_model_parameters , 4  , train , train_label  )

tfidf para :  {'min_df': 2, 'max_df': 5, 'ngram_range': (1, 2)}
Model para :  {'min_samples_split': 0.1, 'min_samples_leaf': 1}
	第1次kfold，label 1 - F1 score : 0.52
	第2次kfold，label 1 - F1 score : 0.41667
	第3次kfold，label 1 - F1 score : 0.31111
	第4次kfold，label 1 - F1 score : 0.5098
	此參數kfold平均為0.43939500000000004
tfidf para :  {'min_df': 2, 'max_df': 5, 'ngram_range': (1, 2)}
Model para :  {'min_samples_split': 0.1, 'min_samples_leaf': 2}
	第1次kfold，label 1 - F1 score : 0.15
	第2次kfold，label 1 - F1 score : 0.31111
	第3次kfold，label 1 - F1 score : 0.05128
	第4次kfold，label 1 - F1 score : 0.2
	此參數kfold平均為0.17809750000000002
tfidf para :  {'min_df': 2, 'max_df': 5, 'ngram_range': (1, 2)}
Model para :  {'min_samples_split': 0.01, 'min_samples_leaf': 1}
	第1次kfold，label 1 - F1 score : 0.5098
	第2次kfold，label 1 - F1 score : 0.38462
	第3次kfold，label 1 - F1 score : 0.31111
	第4次kfold，label 1 - F1 score : 0.5098
	此參數kfold平均為0.4288325
tfidf para :  {'min_df': 2, 'max_df': 5, 'ngram_range': (1, 2)}
Model para

In [93]:
tfidf_want_parameters

{'min_df': 2, 'max_df': 15, 'ngram_range': (1, 3)}

In [94]:
model_want_parameters

{'min_samples_split': 0.1, 'min_samples_leaf': 1}

In [96]:
tfidf_want_parameters = {'min_df': 2, 'max_df': 15, 'ngram_range': (1, 3)}

In [97]:
model_want_parameters = {'min_samples_split': 0.1, 'min_samples_leaf': 1}

In [98]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=15, norm='l2', encoding='utf-8', ngram_range=(1, 2), stop_words='english',max_features =None)
ML_model= DecisionTreeClassifier(criterion='gini',max_depth=None,min_samples_split=1e-07,min_samples_leaf= 1,max_features=None,random_state=7)

tfidf.set_params( **tfidf_want_parameters )
ML_model.set_params( **model_want_parameters )

tfidf.fit(train.loc[np.where(train_label.values==1)[0]].tweet.apply(lambda x: np.str_(x)) ) 
tfidf_train = tfidf.transform(train.tweet.apply(lambda x: np.str_(x))) 
tfidf_test = tfidf.transform(test.tweet.apply(lambda x: np.str_(x)))

ML_model.fit( tfidf_train , train_label )
pred = ML_model.predict( tfidf_test )

print( classification_report(test_label,pred,digits=5).split()[12])  
print( classification_report(test_label,pred,digits=5) )  

0.76667
              precision    recall  f1-score   support

           0    0.99913   0.99986   0.99949     13818
           1    0.92000   0.65714   0.76667        35

   micro avg    0.99899   0.99899   0.99899     13853
   macro avg    0.95957   0.82850   0.88308     13853
weighted avg    0.99893   0.99899   0.99891     13853



In [99]:
train_words = tfidf.get_feature_names()

print("feature:",len(train_words))
print('extract_word:', train_words) 

feature: 222
extract_word: ['1st', 'acetaminophen', 'adderall', 'af', 'amp drink', 'amp im', 'antibiotics', 'arm', 'asleep', 'baby', 'bad', 'bc', 'bed', 'best', 'best friend', 'better', 'birth', 'birth control', 'botox', 'bottle', 'bottle tums', 'bring', 'button', 'canadiankelli', 'castor', 'castor oil', 'cause', 'chloemugg', 'contractions', 'control', 'daily', 'day', 'diabetics', 'diclectin', 'did', 'didnt', 'die', 'doctor', 'does', 'dont', 'drink', 'eat', 'effects', 'end', 'entonox', 'epidural', 'extra', 'fact', 'faster', 'fat', 'feel', 'feel like', 'feel sick', 'feeling', 'finally', 'fine', 'flu', 'friend', 'fucking', 'gain', 'gaviscon', 'getting', 'getting nexplanon', 'giving', 'gmo', 'gmo insulin', 'god', 'goes', 'going', 'gonna', 'good', 'got', 'great', 'hair', 'happy', 'hard', 'hate', 'havent', 'having', 'head', 'headache', 'help', 'helps', 'hey', 'high', 'home', 'hope', 'hour', 'hours', 'http', 'https', 'hurt', 'husband', 'ibuprofen', 'ill', 'im getting', 'im getting nexplanon'

# 第一種 單個DT 單純2020資料 但看過dev data

## TF-IDF

fit : 

* 2020 train label - True
* 2020 val label - True

transform : 

* 2020 train label - True & False
* 2020 val label - True & False

### Train data

In [100]:
trainpath=path+'task1_all.xlsx'

In [101]:
train_ = pd.read_excel(trainpath)

In [102]:
train , train_label = df_(train_)

### Test data

In [103]:
testpath = path+'task1_validation.xlsx'

In [104]:
test_ = pd.read_excel(testpath)

In [105]:
test , test_label = df_(test_)

In [106]:
print('train - label 1   : ' , len(train[train.label==1]))
print('train - label 0   : ' , len(train[train.label==0]))
print('dev   - label 1   : ' , len(test[test.label==1]))
print('dev   - label 0   : ' , len(test[test.label==0]))

train - label 1   :  181
train - label 0   :  69091
dev   - label 1   :  35
dev   - label 0   :  13818


### train & finetune Model

In [112]:
tfidf_parameters = { 
'min_df': [2,3],
'max_df' : [5,10,15],
'ngram_range' : [(1,2),(1,3),(1,4)]
}

ML_model_parameters = { 
'min_samples_split': [0.1,0.01,0.001,0.0001],
'min_samples_leaf' : [1,2], 
}
ML_model= DecisionTreeClassifier(criterion='gini',max_depth=None,min_samples_split=0.1,max_features=None,random_state=7)

In [113]:
tfidf_want_parameters , model_want_parameters = ML_method_one( tfidf_parameters , ML_model , ML_model_parameters , 4  , train , train_label  )

tfidf para :  {'min_df': 2, 'max_df': 5, 'ngram_range': (1, 2)}
Model para :  {'min_samples_split': 0.1, 'min_samples_leaf': 1}
	第1次kfold，label 1 - F1 score : 0.55385
	第2次kfold，label 1 - F1 score : 0.41379
	第3次kfold，label 1 - F1 score : 0.53125
	第4次kfold，label 1 - F1 score : 0.59375
	此參數kfold平均為0.52316
tfidf para :  {'min_df': 2, 'max_df': 5, 'ngram_range': (1, 2)}
Model para :  {'min_samples_split': 0.1, 'min_samples_leaf': 2}
	第1次kfold，label 1 - F1 score : 0.23077
	第2次kfold，label 1 - F1 score : 0.2963
	第3次kfold，label 1 - F1 score : 0.29091
	第4次kfold，label 1 - F1 score : 0.36364
	此參數kfold平均為0.29540500000000003
tfidf para :  {'min_df': 2, 'max_df': 5, 'ngram_range': (1, 2)}
Model para :  {'min_samples_split': 0.01, 'min_samples_leaf': 1}
	第1次kfold，label 1 - F1 score : 0.55385
	第2次kfold，label 1 - F1 score : 0.41379
	第3次kfold，label 1 - F1 score : 0.53125
	第4次kfold，label 1 - F1 score : 0.59375
	此參數kfold平均為0.52316
tfidf para :  {'min_df': 2, 'max_df': 5, 'ngram_range': (1, 2)}
Model para :

In [114]:
tfidf_want_parameters

{'min_df': 2, 'max_df': 15, 'ngram_range': (1, 3)}

In [115]:
model_want_parameters

{'min_samples_split': 0.1, 'min_samples_leaf': 1}

In [116]:
tfidf = TfidfVectorizer( norm='l2' , encoding='utf-8' , stop_words='english')
ML_model= DecisionTreeClassifier(criterion='gini',max_depth=None,min_samples_split=1e-07,min_samples_leaf= 1,max_features=None,random_state=7)

tfidf.set_params( **tfidf_want_parameters )
ML_model.set_params( **model_want_parameters )

tfidf.fit(train.loc[np.where(train_label.values==1)[0]].tweet.apply(lambda x: np.str_(x)) ) 
tfidf_train = tfidf.transform(train.tweet.apply(lambda x: np.str_(x))) 
tfidf_test = tfidf.transform(test.tweet.apply(lambda x: np.str_(x)))

ML_model.fit( tfidf_train , train_label )
pred = ML_model.predict( tfidf_test )

print( classification_report(test_label,pred,digits=5).split()[12])  
print( classification_report(test_label,pred,digits=5) )  

0.86154
              precision    recall  f1-score   support

           0    0.99949   0.99986   0.99967     13818
           1    0.93333   0.80000   0.86154        35

   micro avg    0.99935   0.99935   0.99935     13853
   macro avg    0.96641   0.89993   0.93061     13853
weighted avg    0.99933   0.99935   0.99933     13853



In [117]:
train_words = tfidf.get_feature_names()

print("feature:",len(train_words))
print('extract_word:', train_words) 

feature: 288
extract_word: ['1st', 'acetaminophen', 'adderall', 'af', 'alarm', 'amp drink', 'amp hospice', 'amp hospice taking', 'amp im', 'antibiotics', 'antibiotics amp', 'anxiety', 'anxiety meds', 'arm', 'asked', 'asleep', 'away', 'away adderall', 'baby', 'bad', 'barely', 'bc', 'bed', 'best', 'best friend', 'better', 'birth', 'birth control', 'blood', 'botox', 'bottle', 'bottle tums', 'bring', 'button', 'came', 'canadiankelli', 'case', 'castor', 'castor oil', 'cause', 'chloemugg', 'cold', 'contractions', 'control', 'cough', 'currently', 'currently morphine', 'currently morphine drip', 'daily', 'damn', 'day', 'days', 'delicious', 'diabetics', 'diclectin', 'did', 'didnt', 'die', 'doctor', 'does', 'dont need', 'dont work', 'dose', 'drink', 'drip', 'drip amp', 'drip amp hospice', 'eat', 'effect', 'effects', 'end', 'ended', 'entonox', 'epidural', 'extra', 'fact', 'faster', 'fat', 'feel', 'feel like', 'feel sick', 'feeling', 'finally', 'fine', 'flu', 'flu medication', 'friend', 'fucking',

# 第二種 單個DT 單純2020資料transform 但fit時學了2018 label True

## TF-IDF

fit : 

* 2020 train label - True
* 2018 label - True

transform : 

* 2020 train label - True & False

In [118]:
trainpath2018 = path+'AnnotationDRUGSInTweets_EMNLPChallenge18_TrainingSetClean.csv' 
trainpathori = path+'task1_training.csv'
devpath = path+'task1_validation.csv'

train2018=pd.read_csv(trainpath2018)
trainori=pd.read_csv(trainpathori)
dev=pd.read_csv(devpath)
# trainori = pd.concat([trainori,dev])

#trainori = trainori.rename(columns={"class": "label"})

train2018 = train2018.rename(columns={"comment_text": "tweet"})
train2018 = train2018.rename(columns={"toxic": "class"})
train2018one = train2018[train2018['class']==1]
train2018zero = train2018[train2018['class']==0]

In [119]:
train , train_label = df_(trainori)

In [120]:
train2018one.shape

(4975, 3)

In [121]:
train2018zero.shape

(4647, 3)

In [122]:
fit_train_ = pd.concat([trainori,train2018one])

In [123]:
fit_train , fit_train_label = df_(fit_train_)

In [124]:
fit_train.shape

(60394, 2)

In [125]:
print('train - label 1   : ' , len(train[train.label==1]))
print('train - label 0   : ' , len(train[train.label==0]))
print('fit train - label 1   : ' , len(fit_train[fit_train.label==1]) , ' = 146 + 4975')
print('dev   - label 1   : ' , len(test[test.label==1]))
print('dev   - label 0   : ' , len(test[test.label==0]))

train - label 1   :  146
train - label 0   :  55273
fit train - label 1   :  5121  = 146 + 4975
dev   - label 1   :  35
dev   - label 0   :  13818


In [127]:
tfidf_want_parameters =  {'min_df': 2, 'max_df': 15, 'ngram_range': (1, 3)}
model_want_parameters =  {'min_samples_split': 0.1, 'min_samples_leaf': 1}

In [128]:
tfidf = TfidfVectorizer( norm='l2' , encoding='utf-8' , stop_words='english')
ML_model= DecisionTreeClassifier(criterion='gini',max_depth=None,min_samples_split=1e-07,min_samples_leaf= 1,max_features=None,random_state=7)

tfidf.set_params( **tfidf_want_parameters )
ML_model.set_params( **model_want_parameters )

tfidf.fit(fit_train.loc[np.where(fit_train_label.values==1)[0]].tweet.apply(lambda x: np.str_(x)) ) 
tfidf_train = tfidf.transform(train.tweet.apply(lambda x: np.str_(x))) 
tfidf_test = tfidf.transform(test.tweet.apply(lambda x: np.str_(x)))

ML_model.fit( tfidf_train , train_label )
pred = ML_model.predict( tfidf_test )

print( classification_report(test_label,pred,digits=5).split()[12])  
print( classification_report(test_label,pred,digits=5) )  

0.74576
              precision    recall  f1-score   support

           0    0.99906   0.99986   0.99946     13818
           1    0.91667   0.62857   0.74576        35

   micro avg    0.99892   0.99892   0.99892     13853
   macro avg    0.95786   0.81421   0.87261     13853
weighted avg    0.99885   0.99892   0.99882     13853



In [21]:
tfidf_parameters = { 
'min_df': [2,3,4,5],
'max_df' : [10,11,12,13,14,15,30],
'ngram_range' : [(1,2),(1,3),(1,4)]
}

ML_model_parameters = { 
'min_samples_split': [1e-03,1e-07],
'min_samples_leaf' : [1], 
}

ML_model= DecisionTreeClassifier(criterion='gini',max_depth=None,min_samples_split=0.1,max_features=None,random_state=7)
cv_times = 4


In [22]:
tfidf = TfidfVectorizer( norm='l2' , encoding='utf-8' , stop_words='english')
kf = StratifiedKFold( n_splits = cv_times )
kf.get_n_splits(train,train_label)

tunelist_tfidf = [ dict( zip( tfidf_parameters , v ) ) for v in product ( *tfidf_parameters.values() ) ]
tunelist_ML_model = [ dict( zip( ML_model_parameters , v ) ) for v in product ( *ML_model_parameters.values() ) ]

tfidf_parameter_best_scores = []
ML_model_best_scores_parameter = []

for tfidf_parameter in tunelist_tfidf :

    tfidf.set_params( **tfidf_parameter )
    ML_model_parameter_scores = []

    for ML_model_parameter in tunelist_ML_model :

        ML_model.set_params( **ML_model_parameter )
        cv_scores_just_care_label_1_F1 = []

        num = 1

        print( 'tfidf para : ' , tfidf_parameter )
        print( 'Model para : ' , ML_model_parameter)

        for train_index , test_index in kf.split(train,train_label):

            cv_train , cv_test = train.loc[train_index], train.loc[test_index]
            cv_train_label , cv_test_label = train_label[train_index], train_label[test_index]

            tfidf.fit(fit_train.loc[np.where(fit_train_label.values==1)[0]].tweet.apply(lambda x: np.str_(x)) ) 

            cv_train = tfidf.transform( cv_train.tweet.apply( lambda x: np.str_(x) ) ) 
            cv_test = tfidf.transform( cv_test.tweet.apply( lambda x: np.str_(x) ) )

            ML_model.fit(cv_train,cv_train_label,)
            cv_pred = ML_model.predict(cv_test)

            cv_scores_just_care_label_1_F1.append( float(classification_report(cv_test_label,cv_pred,digits=5).split()[12]) )
            now_score = float(classification_report(cv_test_label,cv_pred,digits=5).split()[12])
            print('\t第'+str(num)+'次kfold，label 1 - F1 score : ' + str(now_score))

            num+=1

        ML_model_parameter_scores.append(np.mean(cv_scores_just_care_label_1_F1))
        print('\t此參數kfold平均為'+str(np.mean(cv_scores_just_care_label_1_F1)))
    tfidf_parameter_best_scores.append( np.max( np.array(ML_model_parameter_scores) ) )
    ML_model_best_scores_parameter.append(tunelist_ML_model[ np.argmax(np.array(ML_model_parameter_scores))] )

print('調參完成 最佳參數為 : ' , tunelist_tfidf[np.argmax(np.array(tfidf_parameter_best_scores))] ,  ML_model_best_scores_parameter[np.argmax(np.array(tfidf_parameter_best_scores))] )
tfidf_want_parameters , model_want_parameters = tunelist_tfidf[np.argmax(np.array(tfidf_parameter_best_scores))] ,  ML_model_best_scores_parameter[np.argmax(np.array(tfidf_parameter_best_scores))]

tfidf para :  {'min_df': 2, 'max_df': 10, 'ngram_range': (1, 2)}
Model para :  {'min_samples_split': 0.001, 'min_samples_leaf': 1}
	第1次kfold，label 1 - F1 score : 0.64516
	第2次kfold，label 1 - F1 score : 0.54545
	第3次kfold，label 1 - F1 score : 0.65574
	第4次kfold，label 1 - F1 score : 0.75862
	此參數kfold平均為0.6512425
tfidf para :  {'min_df': 2, 'max_df': 10, 'ngram_range': (1, 2)}
Model para :  {'min_samples_split': 1e-07, 'min_samples_leaf': 1}
	第1次kfold，label 1 - F1 score : 0.61765
	第2次kfold，label 1 - F1 score : 0.47273
	第3次kfold，label 1 - F1 score : 0.61765
	第4次kfold，label 1 - F1 score : 0.66667
	此參數kfold平均為0.5936750000000001
tfidf para :  {'min_df': 2, 'max_df': 10, 'ngram_range': (1, 3)}
Model para :  {'min_samples_split': 0.001, 'min_samples_leaf': 1}
	第1次kfold，label 1 - F1 score : 0.63492
	第2次kfold，label 1 - F1 score : 0.52632
	第3次kfold，label 1 - F1 score : 0.65574
	第4次kfold，label 1 - F1 score : 0.74576
	此參數kfold平均為0.640685
tfidf para :  {'min_df': 2, 'max_df': 10, 'ngram_range': (1, 3)}


In [23]:
tfidf = TfidfVectorizer( norm='l2' , encoding='utf-8' , stop_words='english')
ML_model= DecisionTreeClassifier(criterion='gini',max_depth=None,min_samples_split=1e-07,min_samples_leaf= 1,max_features=None,random_state=7)

tfidf.set_params( **tfidf_want_parameters )
ML_model.set_params( **model_want_parameters )

tfidf.fit(fit_train.loc[np.where(fit_train_label.values==1)[0]].tweet.apply(lambda x: np.str_(x)) ) 
tfidf_train = tfidf.transform(train.tweet.apply(lambda x: np.str_(x))) 
tfidf_test = tfidf.transform(test.tweet.apply(lambda x: np.str_(x)))

ML_model.fit( tfidf_train , train_label )
pred = ML_model.predict( tfidf_test )

print( classification_report(test_label,pred,digits=5).split()[12])  
print( classification_report(test_label,pred,digits=5) )  

0.70968
              precision    recall  f1-score   support

           0    0.99906   0.99964   0.99935     13818
           1    0.81481   0.62857   0.70968        35

   micro avg    0.99870   0.99870   0.99870     13853
   macro avg    0.90694   0.81410   0.85451     13853
weighted avg    0.99859   0.99870   0.99862     13853



In [216]:
train_words = tfidf.get_feature_names()

print("feature:",len(train_words))
print("shape:",train_features.shape)
print('extract_word:', train_words) 

feature: 246
shape: (58937, 237)
extract_word: ['1st', 'acetaminophen', 'adderall', 'advil', 'af', 'amp drink', 'amp im', 'anti', 'antibiotics', 'arm', 'arnica', 'asleep', 'baby', 'bad', 'bc', 'bed', 'benadryl', 'best', 'best friend', 'better', 'birth', 'birth control', 'botox', 'bottle', 'bottle tums', 'bring', 'button', 'cake', 'canadiankelli', 'castor', 'castor oil', 'cause', 'chloemugg', 'comfort', 'contractions', 'control', 'daily', 'day', 'diabetics', 'diclectin', 'did', 'didnt', 'die', 'doctor', 'does', 'don', 'dont', 'drink', 'drugs', 'eat', 'effects', 'end', 'entonox', 'epidural', 'extra', 'fact', 'faster', 'fat', 'feel', 'feel like', 'feel sick', 'feeling', 'finally', 'fine', 'flu', 'friend', 'fucking', 'gain', 'gain weight', 'gaviscon', 'getting', 'getting nexplanon', 'ginger', 'giving', 'gmo', 'gmo insulin', 'god', 'goes', 'going', 'gonna', 'good', 'got', 'great', 'hair', 'happy', 'hard', 'hate', 'havent', 'having', 'head', 'headache', 'help', 'helps', 'hey', 'high', 'home'

# 第三種 552個DT fit時學了2018 label True transform的時候 包含146 train - true & 9筆 2018 data 

## TF-IDF

fit : 

* 2020 train label - True

transform : 

* 2020 train label - True & False
* 2018 label - True & False

In [24]:
trainpath2018 = path+'AnnotationDRUGSInTweets_EMNLPChallenge18_TrainingSetClean.csv' 
trainpathori = path+'task1_training.csv'
devpath = path+'task1_validation.csv'

train2018=pd.read_csv(trainpath2018)
trainori=pd.read_csv(trainpathori)
dev=pd.read_csv(devpath)
# trainori = pd.concat([trainori,dev])

#trainori = trainori.rename(columns={"class": "label"})

train2018 = train2018.rename(columns={"comment_text": "tweet"})
train2018 = train2018.rename(columns={"toxic": "class"})
train2018one = train2018[train2018['class']==1]
train2018zero = train2018[train2018['class']==0]

In [25]:
print(train2018one.shape[0])

4975


In [26]:
print(train2018zero.shape[0])

4647


In [27]:
(trainori[trainori['class']==0].shape[0]+train2018zero.shape[0])*0.0026

155.792

In [28]:
155-146

9

### Train data

In [8]:
trainpath=path+'task1_training.xlsx'

In [9]:
train_ = pd.read_excel(trainpath)

In [10]:
train , train_label = df_(train_)

In [12]:
tfidf_want_parameters = {'min_df': 2, 'max_df': 11, 'ngram_range': (1, 3)}

In [22]:
model_want_parameters = {'min_samples_split': 0.001, 'min_samples_leaf': 1}

In [23]:
devscore = [0 for i in range(len(dev))]

In [52]:
add_num = 9
len(train2018one)//add_num

552

In [53]:
list_pred_1 = []

for time in range(len(train2018one)//add_num):
    print('第 '+str(time+1)+' 個Model : ')

    nine2018=train2018one[time*add_num:(time+1)*add_num]
    trainnew=pd.concat([trainori,nine2018,train2018zero])

    model = DecisionTreeClassifier(criterion='gini',max_depth=None,min_samples_split = 0.1,max_features=None,random_state=7)
    
    traindf,traindf_label = df_(trainnew)

    tfidf = TfidfVectorizer( norm='l2' , encoding='utf-8' , stop_words='english')
    tfidf.set_params( **tfidf_want_parameters )
    model.set_params( **model_want_parameters )
    
    print('fit label - True 數量 :' , len(traindf[traindf['label']==1].tweet.apply(lambda x: np.str_(x))))
    tfidf.fit(traindf[traindf['label']==1].tweet.apply(lambda x: np.str_(x))) 

    train_features = tfidf.transform(traindf.tweet.apply(lambda x: np.str_(x)))  #You need to convert the dtype object to unicode string as is clearly mentioned in the traceback.
    train_labels = traindf.label
    train_words = tfidf.get_feature_names()
#     print(len(train_words))
#     print(train_features.shape)

    model.fit(train_features,train_labels)

    devdf , val_labels  = df_(dev)

    val_features = tfidf.transform(devdf.tweet.apply(lambda x: np.str_(x)))  #You need to convert the dtype object to unicode string as is clearly mentioned in the traceback.
    val_words = tfidf.get_feature_names()

#     print(len(val_words))
#     print(val_features.shape)

    ans=model.predict(val_features)
    print(classification_report(val_labels,ans,digits=5))
    print(np.where(np.array(ans)==1)[0])
    list_pred_1 
    
    for i in np.where(ans==1)[0]:
        devscore[i]+=1

第 1 個Model : 
fit label - True 數量 : 155
              precision    recall  f1-score   support

           0    0.99884   0.99964   0.99924     13818
           1    0.79167   0.54286   0.64407        35

   micro avg    0.99848   0.99848   0.99848     13853
   macro avg    0.89525   0.77125   0.82165     13853
weighted avg    0.99832   0.99848   0.99834     13853

[  80 1869 1901 1951 2439 2785 3651 3706 3970 3989 4930 5336 5507 6686
 7130 7308 7792 7906 8527 8728 8762 8820 9280 9431]
第 2 個Model : 
fit label - True 數量 : 155
              precision    recall  f1-score   support

           0    0.99892   0.99964   0.99928     13818
           1    0.80000   0.57143   0.66667        35

   micro avg    0.99856   0.99856   0.99856     13853
   macro avg    0.89946   0.78553   0.83297     13853
weighted avg    0.99841   0.99856   0.99844     13853

[  80 1640 1869 1901 1951 2439 2785 2977 3651 3706 3989 4930 5336 5507
 6686 7130 7308 7792 7906 8527 8728 8762 8820 9280 9431]
第 3 個Model : 
f

In [54]:
len(devscore)

13853

In [55]:
len(dev)*0.0026

36.0178

In [58]:
for threshold in range(1,len(dev)+1):
    devans=[0 for i in range(len(dev))]
    num = 0
    for i in range(len(devscore)):
        if(devscore[i]>threshold):
            devans[i]=1
            num+=1
        else : continue
    if(num==36):
        print(threshold)

7
8


In [77]:
devans=[0 for i in range(len(dev))]
for i in range(len(devscore)):
    if(devscore[i]>551):
        devans[i]=1
    else : continue

print(classification_report(val_labels,devans,digits=5))

              precision    recall  f1-score   support

           0    0.99877   0.99986   0.99931     13818
           1    0.90000   0.51429   0.65455        35

   micro avg    0.99863   0.99863   0.99863     13853
   macro avg    0.94939   0.75707   0.82693     13853
weighted avg    0.99852   0.99863   0.99844     13853



# 濾過的2018

In [7]:
# trainpath2018 =path+'gooddata2(2018)SVM.csv'
trainpath2018 =path+'new_svm_filter_2018.csv' 
trainpathori =path+'task1_training.csv'
devpath =path+'task1_validation.csv'

train2018=pd.read_csv(trainpath2018)
trainori=pd.read_csv(trainpathori)
dev=pd.read_csv(devpath)


train2018 = train2018.rename(columns={"comment_text": "tweet"})
train2018 = train2018.rename(columns={"toxic": "class"})
train2018one=train2018[train2018['class']==1]
train2018zero=train2018[train2018['class']==0]

In [8]:
train2018one.shape

(532, 4)

In [9]:
train2018zero.shape

(4593, 4)

In [10]:
(trainori[trainori['class']==0].shape[0]+train2018zero.shape[0])*0.0026

155.6516

In [11]:
152-146

6

In [16]:
devscore=[0 for i in range(len(dev))]

In [12]:
add_num = 9
len(train2018one)//add_num

59

In [13]:
tfidf_want_parameters = {'min_df': 2, 'max_df': 11, 'ngram_range': (1, 3)}

In [14]:
model_want_parameters = {'min_samples_split': 0.001, 'min_samples_leaf': 1}

In [17]:
list_pred_1 = []

for time in range(len(train2018one)//add_num):
    print('第 '+str(time+1)+' 個Model : ')

    nine2018=train2018one[time*add_num:(time+1)*add_num]
    trainnew=pd.concat([trainori,nine2018,train2018zero])

    model = DecisionTreeClassifier(criterion='gini',max_depth=None,min_samples_split=1e-07, min_samples_leaf=1,max_features=None ,max_leaf_nodes=100,random_state=7)
    
    traindf , traindf_label = df_(trainnew)


    tfidf = TfidfVectorizer( norm='l2' , encoding='utf-8' , stop_words='english')
    tfidf.set_params( **tfidf_want_parameters )
    model.set_params( **model_want_parameters )
    
    tfidf.fit(traindf[traindf['label']==1].tweet.apply(lambda x: np.str_(x))) 

    train_features = tfidf.transform(traindf.tweet.apply(lambda x: np.str_(x)))  #You need to convert the dtype object to unicode string as is clearly mentioned in the traceback.
    train_labels = traindf.label
    train_words = tfidf.get_feature_names()
#     print(len(train_words))
#     print(train_features.shape)

    model.fit(train_features,train_labels)

    devdf , devdf_label = df_(dev)

    val_features = tfidf.transform(devdf.tweet.apply(lambda x: np.str_(x)))  #You need to convert the dtype object to unicode string as is clearly mentioned in the traceback.
    val_labels = devdf.label
    val_words = tfidf.get_feature_names()

#     print(len(val_words))
#     print(val_features.shape)

    ans=model.predict(val_features)
    print(classification_report(val_labels,ans,digits=5))
    print(np.where(np.array(ans)==1)[0])
    list_pred_1 
    
    for i in np.where(ans==1)[0]:
        devscore[i]+=1

第 1 個Model : 
              precision    recall  f1-score   support

           0    0.99906   0.99971   0.99939     13818
           1    0.84615   0.62857   0.72131        35

   micro avg    0.99877   0.99877   0.99877     13853
   macro avg    0.92261   0.81414   0.86035     13853
weighted avg    0.99867   0.99877   0.99868     13853

[  80 1869 1951 2439 2785 2977 3651 3706 3989 4218 4930 5336 5507 6686
 7051 7130 7308 7537 7792 7906 8527 8728 8762 8820 9280 9431]
第 2 個Model : 
              precision    recall  f1-score   support

           0    0.99913   0.99971   0.99942     13818
           1    0.85185   0.65714   0.74194        35

   micro avg    0.99885   0.99885   0.99885     13853
   macro avg    0.92549   0.82843   0.87068     13853
weighted avg    0.99876   0.99885   0.99877     13853

[  80  155 1869 1951 2439 2785 2977 3651 3706 3989 4218 4930 5336 5507
 6686 7051 7130 7308 7537 7792 7906 8527 8728 8762 8820 9280 9431]
第 3 個Model : 
              precision    recall

In [22]:
for threshold in range(1,len(dev)+1):
    devans=[0 for i in range(len(dev))]
    num = 0
    for i in range(len(devscore)):
        if(devscore[i]>threshold):
            devans[i]=1
            num+=1
        else : continue
    if(num==36):
        print(threshold)

In [30]:
devans=[0 for i in range(len(dev))]
for i in range(len(devscore)):
    if(devscore[i]>10):
        devans[i]=1
    else : continue

print(classification_report(val_labels,devans,digits=5))

              precision    recall  f1-score   support

           0    0.99913   0.99971   0.99942     13818
           1    0.85185   0.65714   0.74194        35

   micro avg    0.99885   0.99885   0.99885     13853
   macro avg    0.92549   0.82843   0.87068     13853
weighted avg    0.99876   0.99885   0.99877     13853

