# Import

In [5]:
import pandas as pd 
import sqlalchemy as db 


In [7]:
engine = db.create_engine('sqlite:///myDB.db')
df = pd.read_sql('select * from myTable',engine)

In [10]:
df.head(3)

Unnamed: 0,id,message,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,direct,1,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,direct,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
X = df['message'].values #will create array of all values in message
Y = df.drop(['id','message','genre'],axis=1)

In [15]:
X 

array(['Weather update - a cold front from Cuba that could pass over Haiti',
       'Is the Hurricane over or is it not over',
       'Looking for someone but no name', ...,
       "Proshika, operating in Cox's Bazar municipality and 5 other unions, Ramu and Chokoria, assessment, 5 kg rice, 1,5 kg lentils to 700 families.",
       'Some 2,000 women protesting against the conduct of the elections were teargassed as they tried to converge on the local electoral commission offices in the southern oil city of Port Harcourt.',
       'A radical shift in thinking came about as a result of this meeting, recognizing that HIV/AIDS is at the core of the humanitarian crisis and identifying the crisis itself as a function of the HIV/AIDS pandemic.'],
      dtype=object)

In [16]:
Y 

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26382,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26383,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26384,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Tokenization

In [17]:
import nltk 
import pickle  
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package stopwords to /home/ind/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
def tokenize(text):
    token = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    
    clean_token = []
    
    for t in token:
        clean_tok = lemmatizer.lemmatize(t).lower().strip()
        clean_token.append(clean_tok)
        
    return clean_token

- if you dont understand then
- word_tokenize('Hi there')
- ['Hi', 'there']
- Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item
- print("rocks:",WordNetLemmatizer().lemmatize("rocks"))
- rocks :rock



# Creating `PIPELINE`

In [45]:
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

In [46]:
pipeline = Pipeline([
                      ('vector',CountVectorizer(tokenizer=tokenize)),
                      ('tfidf',TfidfTransformer()),
                      ('clf',MultiOutputClassifier(RandomForestClassifier()))
    
                    ])

## Train Test Split

In [47]:
from sklearn.model_selection import train_test_split

In [54]:
# By mistake i have name y_test as t_test

In [48]:
X_train,X_test,y_train,t_test = train_test_split(X,Y,test_size=0.2)

In [49]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('vector',
                 CountVectorizer(tokenizer=<function tokenize at 0x7f320e04e680>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

# Testing Model RandomForest

In [50]:
y_pred = pipeline.predict(X_test)

In [51]:
y_pred

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [53]:
y_pred_df  = pd.DataFrame(y_pred,columns=t_test.columns)

In [57]:
#prediction df
y_pred_df.head(10)

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
6,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Accuracy

In [59]:
from sklearn.metrics import classification_report, accuracy_score

In [64]:
for col in t_test.columns:
    print('*****************************************************************\n')
    print(f'Accuracy {accuracy_score(t_test[col],y_pred_df[col])}')
    print(f'Feature: {col}\n')
    print(classification_report(t_test[col],y_pred_df[col]))

*****************************************************************

Accuracy 0.7997347480106101
Feature: related

              precision    recall  f1-score   support

           0       0.73      0.26      0.38      1232
           1       0.81      0.97      0.88      4017
           2       0.75      0.21      0.32        29

    accuracy                           0.80      5278
   macro avg       0.76      0.48      0.53      5278
weighted avg       0.79      0.80      0.76      5278

*****************************************************************

Accuracy 0.8880257673361122
Feature: request

              precision    recall  f1-score   support

           0       0.89      0.99      0.94      4355
           1       0.88      0.41      0.56       923

    accuracy                           0.89      5278
   macro avg       0.89      0.70      0.75      5278
weighted avg       0.89      0.89      0.87      5278

*****************************************************************


Accuracy 0.9353921940128836
Feature: infrastructure_related

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      4942
           1       0.00      0.00      0.00       336

    accuracy                           0.94      5278
   macro avg       0.47      0.50      0.48      5278
weighted avg       0.88      0.94      0.91      5278

*****************************************************************

Accuracy 0.9577491474043198
Feature: transport

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5044
           1       0.74      0.07      0.13       234

    accuracy                           0.96      5278
   macro avg       0.85      0.54      0.56      5278
weighted avg       0.95      0.96      0.94      5278

*****************************************************************

Accuracy 0.9533914361500568
Feature: buildings

              precision    recall  f1-score   support



  _warn_prf(average, modifier, msg_start, len(result))


# AdaBoost

In [84]:
from sklearn.ensemble import AdaBoostClassifier

In [85]:
improved_pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(AdaBoostClassifier()))
    ])

In [86]:
improved_pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x7f320e04e680>)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultiOutputClassifier(estimator=AdaBoostClassifier()))])

## Test Model Adaboost

In [87]:
y_pred_ada = improved_pipeline.predict(X_test)

In [89]:
y_pred_df_ada  = pd.DataFrame(y_pred_ada,columns=t_test.columns)

In [90]:
y_pred_df_ada.head(5)

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
for col in t_test.columns:
    print('*****************************************************************\n')
    print(f'Accuracy {accuracy_score(t_test[col],y_pred_df_ada[col])}')
    print(f'Feature: {col}\n')
    print(classification_report(t_test[col],y_pred_df_ada[col]))

*****************************************************************

Accuracy 0.7807881773399015
Feature: related

              precision    recall  f1-score   support

           0       0.59      0.27      0.37      1232
           1       0.80      0.94      0.87      4017
           2       0.33      0.14      0.20        29

    accuracy                           0.78      5278
   macro avg       0.58      0.45      0.48      5278
weighted avg       0.75      0.78      0.75      5278

*****************************************************************

Accuracy 0.8925729442970822
Feature: request

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      4355
           1       0.78      0.53      0.63       923

    accuracy                           0.89      5278
   macro avg       0.85      0.75      0.79      5278
weighted avg       0.89      0.89      0.88      5278

*****************************************************************


In [92]:
overall_accuracy = (y_pred_ada == t_test).mean().mean()
overall_accuracy

0.9477390425666284

# Pickling

In [93]:
with open('adaboost.pkl','wb') as file: 
    pickle.dump(y_pred_df_ada,file)