![steps for solution](steps-slide.png)

# The purpose of this example is to predict whether a superhero's creator is "Marvel Comics" or not, based on the superhero's history text (the `history_text` column)

In [45]:
import numpy as np
import pandas as pd

# Load the dataset from file into memory

In [190]:
# Location of the superheroes CSV, relative to the notebook's working directory.
DATASET_PATH = 'super-heros-dataset/superheroes_nlp_dataset.csv'
dataset_superheros = pd.read_csv(DATASET_PATH)

In [191]:
# Report how many rows and columns were loaded.
n_records, n_columns = dataset_superheros.shape
print(f'Loaded dataset with {n_records} records with {n_columns} columns')

Loaded dataset with 1450 records with 81 columns


# Create target column and make some adjustments in the dataset

In [None]:
def normalize_target_column(x):
    """Encode a creator name as a binary label.

    Returns 1 when ``x`` equals the exact string 'Marvel Comics' and 0 for
    any other value (other creators, differently-cased names, missing values).
    """
    if x == 'Marvel Comics':
        return 1
    return 0

In [None]:
# Binary target column: 1 for 'Marvel Comics', 0 for every other creator.
dataset_superheros['creator_normalized'] = dataset_superheros['creator'].apply(normalize_target_column)

# Split the dataset into train / test data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the records as the test set and keep 80% for training.
# The fixed random_state makes the split reproducible across runs.
train, test = train_test_split(
    dataset_superheros,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Report the size of each split; the message labels both counts explicitly
# so the second number is not mistaken for another train-set figure.
print(f'Dataset is split into train set of {train.shape[0]} records and test set of {test.shape[0]} records.')

# Remove all records whose history text is null

In [None]:
# Keep only the history texts that are present; rows with a null
# history_text are dropped from both splits. The original row index is
# preserved on each resulting Series.
train_history_text = train['history_text'].dropna()
test_history_text = test['history_text'].dropna()

# Transform text to features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Active configuration: TF-IDF weighting with English stop words removed.
# No ngram_range is passed, so the vectorizer's default n-gram setting applies.
vectorizer = TfidfVectorizer(stop_words="english")
# Alternative vectorizers kept for experimentation (enable at most one):
#vectorizer = TfidfVectorizer(stop_words="english", ngram_range = (2, 3))
#vectorizer = TfidfVectorizer()
#vectorizer = CountVectorizer(stop_words="english")
#vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the training texts only, then vectorize them.
TF_IDF_matrix_input_train = vectorizer.fit_transform(train_history_text)
train_record_count, train_feature_count = TF_IDF_matrix_input_train.shape
print(f'Train set input has {train_record_count} records and {train_feature_count} features.')

In [None]:
# Vectorize the test texts with the already-fitted vectorizer (transform only,
# no refitting), so test features line up with the training features.
TF_IDF_matrix_input_test = vectorizer.transform(test_history_text)
test_record_count, test_feature_count = TF_IDF_matrix_input_test.shape
print(f'Test set input has {test_record_count} records and {test_feature_count} features.')

In [None]:
# Names of the output features produced by the fitted vectorizer.
# NOTE(review): not used in the visible cells — presumably kept for inspection; confirm.
feature_names = vectorizer.get_feature_names_out()

# Prepare the expected output for train and test

In [None]:
# Select the labels for exactly the rows whose history_text is present, so the
# targets stay aligned with the rows that were vectorized.
train_has_history = train['history_text'].notna()
test_has_history = test['history_text'].notna()
expected_output_train = train.loc[train_has_history, 'creator_normalized']
expected_output_test = test.loc[test_has_history, 'creator_normalized']

# Train classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier. C is the inverse regularization strength and
# max_iter caps the number of solver iterations.
classifier = LogisticRegression(C=5, max_iter=1000)

In [None]:
# Fit the classifier on the vectorized training texts and their binary labels.
classifier.fit(TF_IDF_matrix_input_train, expected_output_train)

In [None]:
# Predict a label for every vectorized test record.
predictions = classifier.predict(TF_IDF_matrix_input_test)

# Show Prediction Scores

In [None]:
from sklearn import metrics

# Recall of the test predictions against the expected labels
# (label 1 corresponds to 'Marvel Comics').
recall = metrics.recall_score(expected_output_test, predictions)

In [None]:
# Display the recall score.
print(recall)

In [None]:
# Precision of the test predictions against the expected labels
# (label 1 corresponds to 'Marvel Comics').
precision = metrics.precision_score(expected_output_test, predictions)

In [None]:
# Display the precision score.
print(precision)

In [None]:
# Accuracy: the fraction of test records whose predicted label matches the expected label.
score = metrics.accuracy_score(expected_output_test, predictions)
print(score)