In [16]:
import nltk
import re 
import time
import pandas as pd
import numpy as np


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download (['punkt', 'wordnet'])


from sqlalchemy import create_engine

pd.set_option('display.max_columns', 100)

[nltk_data] Downloading package punkt to /Users/kofiosei-
[nltk_data]     bonsu/projects/learning/new_ml/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kofiosei-
[nltk_data]     bonsu/projects/learning/new_ml/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [43]:
from sklearn.linear_model import LogisticRegression

In [14]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [79]:
# get dataset
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql("SELECT * FROM Messages", engine)

In [76]:
# df['related'].value_counts()

len(df[df.related != 2])

10038

In [81]:
value_counts_df = pd.DataFrame({col: y[col].value_counts() for col in y.columns})
value_counts_df


Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,3395,6424,10028,6100,9464,9696,9832,9909,9994,10038.0,...,9964,9860,8591,9755,9762,10000,9248,9979,9844,6565
1,6643,3614,10,3938,574,342,206,129,44,,...,74,178,1447,283,276,38,790,59,194,3473


In [70]:
# function to tokenize text

url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
def tokenize(text):
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
#     remove punctuations
    tokens = word_tokenize(re.sub(r"[^a-zA-Z0-9]", " ", text.lower()))
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [80]:
X = df['message']
y = df.iloc[:,4:]

In [49]:
y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
value_counts_df = pd.DataFrame({col: y[col].value_counts() for col in y.columns})
value_counts_df


Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,6122,21742.0,26098.0,15356.0,24132.0,24903.0,25492.0,25745.0,25356.0,26216.0,...,25907.0,25065.0,18919.0,24061.0,23773.0,25934.0,23761.0,25686.0,24840.0,21141.0
1,19906,4474.0,118.0,10860.0,2084.0,1313.0,724.0,471.0,860.0,,...,309.0,1151.0,7297.0,2155.0,2443.0,282.0,2455.0,530.0,1376.0,5075.0
2,188,,,,,,,,,,...,,,,,,,,,,


In [64]:
y['related'].value_counts()

related
1    19906
0     6122
2      188
Name: count, dtype: int64

In [40]:
# pipeline

pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize, stop_words='english')),
    ('tfidf', TfidfTransformer()),
     ('clf', MultiOutputClassifier(RandomForestClassifier())
    )])

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [92]:
df.columns

Index(['id', 'message', 'original', 'genre', 'related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'child_alone', 'water', 'food', 'shelter',
       'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [84]:
len(y_train.columns)

36

In [85]:
pipeline.fit(X_train, y_train)



In [86]:
y_pred = pipeline.predict(X_test)

In [87]:
# Calculate the accuracy for each of them.
for i in range(15):
    print('Category: {} '.format(y.columns[i]))
    print(classification_report(y_test.iloc[:, i].values, y_pred[:, i]))
    print('Accuracy {}\n\n'.format(accuracy_score(y_test.iloc[:, i].values, y_pred[:, i])))
    print('F1 {}\n\n'.format(f1_score(y_test.iloc[:, i].values, y_pred[:, i],average='weighted')))

Category: related 
              precision    recall  f1-score   support

           0       0.70      0.67      0.69       874
           1       0.83      0.85      0.84      1636

    accuracy                           0.79      2510
   macro avg       0.77      0.76      0.76      2510
weighted avg       0.78      0.79      0.79      2510

Accuracy 0.7868525896414342


F1 0.7854170510434411


Category: request 
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      1647
           1       0.81      0.73      0.77       863

    accuracy                           0.85      2510
   macro avg       0.84      0.82      0.83      2510
weighted avg       0.85      0.85      0.85      2510

Accuracy 0.848605577689243


F1 0.8467738244410329


Category: offer 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2506
           1       0.00      0.00      0.00         4

    accuracy        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [56]:
len(y_pred)

6554

In [90]:
print(classification_report(y_test.iloc[:, 1:].values, np.array([x[1:] for x in y_pred]), target_names = y.columns[:-1]))


                        precision    recall  f1-score   support

               related       0.81      0.73      0.77       863
               request       0.00      0.00      0.00         4
                 offer       0.80      0.75      0.78       937
           aid_related       0.71      0.07      0.12       150
          medical_help       0.67      0.08      0.14        79
      medical_products       1.00      0.02      0.03        58
     search_and_rescue       0.00      0.00      0.00        24
              security       0.00      0.00      0.00         9
              military       0.00      0.00      0.00         0
           child_alone       0.91      0.80      0.85       192
                 water       0.93      0.86      0.89       350
                  food       0.87      0.55      0.67       258
               shelter       0.00      0.00      0.00        23
              clothing       0.00      0.00      0.00        35
                 money       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [91]:
y.columns

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [93]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [97]:
# Find the best performing model

classifiers = [
    # RandomForestClassifier(),
    MultinomialNB(),
    # SVC(),
    AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,class_weight='balanced'))
    # LogisticRegression(max_iter=1000)
]

In [98]:
for classifier in classifiers:
    # Create a new pipeline with the current classifier
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize, stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(classifier))  # Replace classifier dynamically
    ])
    
    try:
        # Train the model
        print(f'Training: {classifier}')
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_pred = pipeline.predict(X_test)
        
        # Evaluate the model (you can choose any metric)
        print(f"\nModel: {classifier.__class__.__name__}")
        print(classification_report(y_test.iloc[:, 1:].values, np.array([x[1:] for x in y_pred]), 
                                    target_names = y.columns[:-1]))
    except ValueError as e:
        print(f"Error with {classifier.__class__.__name__}: {e}")

Training: MultinomialNB()





Model: MultinomialNB
                        precision    recall  f1-score   support

               related       0.77      0.64      0.70       863
               request       0.00      0.00      0.00         4
                 offer       0.77      0.71      0.73       937
           aid_related       0.00      0.00      0.00       150
          medical_help       0.00      0.00      0.00        79
      medical_products       0.00      0.00      0.00        58
     search_and_rescue       0.00      0.00      0.00        24
              security       0.00      0.00      0.00         9
              military       0.00      0.00      0.00         0
           child_alone       1.00      0.03      0.06       192
                 water       0.89      0.13      0.23       350
                  food       0.75      0.02      0.05       258
               shelter       0.00      0.00      0.00        23
              clothing       0.00      0.00      0.00        35
                 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Error with SVC: The number of classes has to be greater than one; got 1 class
Training: LogisticRegression(max_iter=1000)




Error with LogisticRegression: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
