In [1]:
# import libraries
from sklearn.metrics import hamming_loss, jaccard_score, f1_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
import nltk
import pickle
nltk.download('punkt') 
import sqlite3
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\letsm005\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# load data from database
#engine = create_engine('sqlite:///InsertDatabaseName.db')
conn = sqlite3.connect('etl_disaster_data.db')

# Read data from SQLite database into a DataFrame
query = "SELECT * FROM etl_disaster_table"
df = pd.read_sql_query(query, conn).head(5000)

# Close the connection
conn.close()

In [6]:
df.dtypes

id               int64
categories      object
category_0       int64
category_1       int64
category_2       int64
category_3       int64
category_4       int64
category_5       int64
category_6       int64
category_7       int64
category_8       int64
category_9       int64
category_10      int64
category_11      int64
category_12      int64
category_13      int64
category_14      int64
category_15      int64
category_16      int64
category_17      int64
category_18      int64
category_19      int64
category_20      int64
category_21      int64
category_22      int64
category_23      int64
category_24      int64
category_25      int64
category_26      int64
category_27      int64
category_28      int64
category_29      int64
category_30      int64
category_31      int64
category_32      int64
category_33      int64
category_34      int64
category_35      int64
message         object
genre_direct     int64
genre_news       int64
genre_social     int64
dtype: object

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [13]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df.drop(["message","id","categories"],axis=1), test_size=0.2, random_state=42
)

In [14]:
# Define pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Text preprocessing
    ('clf', MultiOutputClassifier(RandomForestClassifier()))  # Multi-output classifier
])



In [15]:
### 4. Train pipeline
pipeline.fit(X_train, y_train)

In [16]:
y_pred_ = pipeline.predict(X_test)

In [17]:
y_pred_

array([[0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 1, 0, ..., 1, 0, 0],
       ...,
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0]])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [None]:
y_pred = pipeline.predict(X_test)
# Calculate accuracy for each label
accuracies = [accuracy_score(y_test[label], y_pred[:, idx]) for idx, label in enumerate(y_test.columns)]
print("Accuracy for each label:", accuracies)

# Calculate Hamming Loss for each label
hamming_losses = [hamming_loss(y_test[label], y_pred[:, idx]) for idx, label in enumerate(y_test.columns)]
print("Hamming Loss for each label:", hamming_losses)

# Calculate Jaccard Score for each label
jaccard_scores = [jaccard_score(y_test[label], y_pred[:, idx], average=None) for idx, label in enumerate(y_test.columns)]
print("Jaccard Score for each label:", jaccard_scores)

# Calculate F1 Score for each label
f1_scores = [f1_score(y_test[label], y_pred[:, idx], average=None) for idx, label in enumerate(y_test.columns)]
print("F1 Score for each label:", f1_scores)


In [None]:


# Assuming y_test and y_pred are your true labels and predicted labels respectively

# Flatten y_test and y_pred to fit classification_report
y_test_flat = y_test.values.ravel()
y_pred_flat = y_pred.ravel()

# Generate the classification report
report = classification_report(y_test_flat, y_pred_flat)

print("Classification Report:")
print(report)


### 6. Improve your model
Use grid search to find better parameters. 

In [None]:
param_grid = {
    'tfidf__max_features': [1000, 2000, 3000],  # Number of features to consider
    'tfidf__ngram_range': [(1, 1), (1, 2)],      # Range of n-grams
    'clf__estimator__n_estimators': [100, 200, 300],  # Number of trees in the forest
    'clf__estimator__max_depth': [10, 20, 30],       # Maximum depth of the tree
}

# Perform grid search cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=3, verbose=2, n_jobs=-1)

# Fit grid search on training data
grid_search.fit(X_train, y_train)

# Evaluate performance on test set
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_test)

In [None]:
print("Optimal parameters :",best_pipeline)

In [None]:
# Define pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 2))),  # Text preprocessing
    ('clf', MultiOutputClassifier(RandomForestClassifier(
        max_depth=30,n_estimators=200
    )))  # Multi-output classifier
])

In [None]:
### 4. Train pipeline
pipeline.fit(X_train, y_train)

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [None]:
y_pred = pipeline.predict(X_test)
# Calculate accuracy for each label
accuracies = [accuracy_score(y_test[label], y_pred[:, idx]) for idx, label in enumerate(y_test.columns)]
print("Accuracy for each label:", accuracies)

# Calculate Hamming Loss for each label
hamming_losses = [hamming_loss(y_test[label], y_pred[:, idx]) for idx, label in enumerate(y_test.columns)]
print("Hamming Loss for each label:", hamming_losses)

# Calculate Jaccard Score for each label
jaccard_scores = [jaccard_score(y_test[label], y_pred[:, idx], average=None) for idx, label in enumerate(y_test.columns)]
print("Jaccard Score for each label:", jaccard_scores)

# Calculate F1 Score for each label
f1_scores = [f1_score(y_test[label], y_pred[:, idx], average=None) for idx, label in enumerate(y_test.columns)]
print("F1 Score for each label:", f1_scores)


In [None]:

# Assuming y_test and y_pred are your true labels and predicted labels respectively

# Flatten y_test and y_pred to fit classification_report
y_test_flat = y_test.values.ravel()
y_pred_flat = y_pred.ravel()

# Generate the classification report
report = classification_report(y_test_flat, y_pred_flat)

print("Classification Report:")
print(report)


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [18]:
# Define pipeline with CountVectorizer and use XGboost classifier
pipeline_v2 = Pipeline([
    ('count_vectorizer', CountVectorizer()),  # Text preprocessing
    ('clf', MultiOutputClassifier(XGBClassifier()))  # Multi-output classifier
])



In [22]:
y_pred = pipeline_v2.predict(X_test)
# Calculate accuracy for each label
accuracies = [accuracy_score(y_test[label], y_pred[:, idx]) for idx, label in enumerate(y_test.columns)]
print("Accuracy for each label:", accuracies)

# Calculate Hamming Loss for each label
hamming_losses = [hamming_loss(y_test[label], y_pred[:, idx]) for idx, label in enumerate(y_test.columns)]
print("Hamming Loss for each label:", hamming_losses)

# Calculate Jaccard Score for each label
jaccard_scores = [jaccard_score(y_test[label], y_pred[:, idx], average=None) for idx, label in enumerate(y_test.columns)]
print("Jaccard Score for each label:", jaccard_scores)

# Calculate F1 Score for each label
f1_scores = [f1_score(y_test[label], y_pred[:, idx], average=None) for idx, label in enumerate(y_test.columns)]
print("F1 Score for each label:", f1_scores)
# Assuming y_test and y_pred are your true labels and predicted labels respectively

# Flatten y_test and y_pred to fit classification_report
y_test_flat = y_test.values.ravel()
y_pred_flat = y_pred.ravel()

# Generate the classification report
report = classification_report(y_test_flat, y_pred_flat)

print("Classification Report:")
print(report)


Accuracy for each label: [0.847, 0.826, 0.997, 0.792, 0.925, 0.954, 0.961, 0.974, 0.993, 1.0, 0.975, 0.954, 0.941, 0.993, 0.986, 0.98, 0.982, 0.964, 0.839, 0.957, 0.966, 0.955, 0.995, 0.997, 0.987, 0.997, 0.994, 0.972, 0.916, 0.97, 0.984, 0.995, 0.977, 0.992, 0.972, 0.817, 0.0, 1.0, 1.0]
Hamming Loss for each label: [0.153, 0.174, 0.003, 0.208, 0.075, 0.046, 0.039, 0.026, 0.007, 0.0, 0.025, 0.046, 0.059, 0.007, 0.014, 0.02, 0.018, 0.036, 0.161, 0.043, 0.034, 0.045, 0.005, 0.003, 0.013, 0.003, 0.006, 0.028, 0.084, 0.03, 0.016, 0.005, 0.023, 0.008, 0.028, 0.183, 1.0, 0.0, 0.0]
Jaccard Score for each label: [array([0.4664311 , 0.82638889, 0.16666667]), array([0.71048253, 0.69633508]), array([0.997, 0.   ]), array([0.6357268 , 0.67346939]), array([0.92323439, 0.23469388]), array([0.95286885, 0.34285714]), array([0.96088265, 0.07142857]), array([0.974, 0.   ]), array([0.993, 0.   ]), array([1.]), array([0.97142857, 0.83333333]), array([0.94155019, 0.82239382]), array([0.93473451, 0.61935484

In [19]:
# Find the minimum value in y_train
min_label = y_train.min()

# Adjust the target labels
y_train_adjusted = y_train - min_label

# Fit the pipeline with the adjusted labels
pipeline_v2.fit(X_train, y_train_adjusted)



In [20]:

# Assuming y_test and y_pred are your true labels and predicted labels respectively

# Flatten y_test and y_pred to fit classification_report
y_test_flat = y_test.values.ravel()
y_pred_flat = y_pred.ravel()

# Generate the classification report
report = classification_report(y_test_flat, y_pred_flat)

print("Classification Report:")
print(report)

NameError: name 'y_pred' is not defined

In [None]:
pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),  # Text preprocessing
    ('clf', MultiOutputClassifier(XGBClassifier()))  # Multi-output classifier
])

# Find the minimum value in y_train
min_label = y_train.min()

# Adjust the target labels
y_train_adjusted = y_train - min_label

# Initialize GridSearchCV with adjusted target labels
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, scoring='accuracy')

# Perform grid search with adjusted target labels
grid_search.fit(X_train, y_train_adjusted)

# Print best parameters and best score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)


### 9. Export your model as a pickle file

In [None]:
# Assuming 'pipeline' is your trained pipeline
pipeline.fit(X_train, y_train)  # Train your pipeline if not already done

# Serialize the pipeline using pickle
with open('model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

### 10. Use this notebook to complete `train_classifier.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.