![steps for solution](steps-slide.png)

# The purpose of this example is to predict whether a superhero's creator is "Marvel Comics" or not, based on the superhero's history text (the `history_text` column)

In [54]:
import pandas as pd

# Load the dataset from file into memory

In [55]:
# Read the superheroes NLP CSV into a DataFrame (the path is relative, not absolute).
dataset_superheros = pd.read_csv(
    filepath_or_buffer='super-heros-dataset/superheroes_nlp_dataset.csv',
)

In [56]:
# Report the size of the loaded frame: row count first, then column count.
print(f'Loaded dataset with {len(dataset_superheros)} records with {len(dataset_superheros.columns)} columns')

Loaded dataset with 1450 records with 81 columns


# Generate the target column in the dataset

In [57]:
def generate_target_column(x):
    """Encode a creator value as a binary label.

    Returns 1 when ``x`` compares equal to the string 'Marvel Comics'
    (exact, case-sensitive match) and 0 for every other value.
    """
    if x == 'Marvel Comics':
        return 1
    return 0

In [58]:
# Derive the binary target column: 1 where the creator is 'Marvel Comics', 0 otherwise.
# The function is passed to apply() directly; wrapping it in a lambda added nothing.
dataset_superheros['creator_normalized'] = dataset_superheros['creator'].apply(generate_target_column)

# Split the dataset into train / test data

In [59]:
from sklearn.model_selection import train_test_split
# Split data between training set and test set (80%, 20%).
# A fixed random_state makes the shuffle deterministic, so the split and every
# metric computed from it are reproducible across kernel restarts.
train, test = train_test_split(dataset_superheros, test_size=0.2, random_state=42)

In [60]:
# Report how many records ended up in the train and test partitions.
print(f'Dataset is split into train set of {len(train)} records and {len(test)} records.')

Dataset is split into train set of 1160 records and 290 records.


# Extract features from the text column

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words="english")

In [62]:
# Keep only the history text of training records whose history text is not null
train_text_columns = train.loc[train['history_text'].notna(), 'history_text']

In [63]:
# Keep only the history text of test records whose history text is not null
test_text_columns = test.loc[test['history_text'].notna(), 'history_text']

In [64]:
# Fit the TF-IDF vectorizer on the training text and transform that same text
# into the training feature matrix.
X_train = vectorizer.fit_transform(train_text_columns)

In [65]:
# Training labels, restricted to the same rows (non-null history text) used for the training text.
y_train = train.loc[train['history_text'].notna(), 'creator_normalized']

In [66]:
# Transform the test text with the already-fitted vectorizer; transform() is used
# here (not fit_transform), so the vectorizer is not refitted on test data.
X_test = vectorizer.transform(test_text_columns)

In [67]:
# Test labels, restricted to the same rows (non-null history text) used for the test text.
y_test = test.loc[test['history_text'].notna(), 'creator_normalized']

In [68]:
# Output feature names of the fitted vectorizer (one per column of X_train / X_test).
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
feature_names = vectorizer.get_feature_names_out()

# Train classifier

In [27]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=5 and an iteration cap of 1000.
classifier = LogisticRegression(
    C=5,
    max_iter=1000,
)

In [28]:
# Train the classifier on the TF-IDF training matrix and its binary labels.
classifier.fit(X_train, y_train)

In [29]:
# Predict the binary label (1 = 'Marvel Comics', 0 = anything else) for each test record.
predictions = classifier.predict(X_test)

# Show Metrics

In [30]:
from sklearn import metrics

# Accuracy of the predictions against the test labels.
score = metrics.accuracy_score(y_true=y_test, y_pred=predictions)

In [31]:
# Show the accuracy on the test set.
print(score)

0.9210526315789473


In [32]:
# Precision for the positive class (label 1, i.e. 'Marvel Comics') on the test set.
precision = metrics.precision_score(y_true=y_test, y_pred=predictions)

In [33]:
# Show the precision on the test set.
print(precision)

0.9797979797979798
