![steps for solution](steps-slide.png)

# Load the input file into memory

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the superheroes CSV (path is relative to the working directory) into a DataFrame.
df_superheros = pd.read_csv('super-heros-dataset/superheroes_nlp_dataset.csv')

In [None]:
# Sanity check: report how many rows and columns were loaded.
print(f'Loaded {df_superheros.shape[0]} records with {df_superheros.shape[1]} fields.')

# Map the target column

In [None]:
def normalize(x):
    """Encode a creator value as a binary label.

    Returns 1 when ``x`` equals the exact string 'Marvel Comics' and 0 for
    any other value (other publishers, differently-cased strings, missing
    values).
    """
    if x == 'Marvel Comics':
        return 1
    return 0

In [None]:
# Build the binary target column: 1 for 'Marvel Comics', 0 for every other creator value.
df_superheros['creator_normalized'] = df_superheros['creator'].apply(normalize)

# Train / test data split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so the split, and
# therefore every metric reported further down, is reproducible across runs.
train, test = train_test_split(df_superheros, test_size=0.2, random_state=42)

In [None]:
# Report the size of each split; both counts are labelled so the output is unambiguous.
print(f'Split the data into train set of {train.shape[0]} records and test set of {test.shape[0]} records.')

# Retrieve features from text columns

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF text vectorizer using scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words="english")

In [None]:
# Training texts: the history_text column with missing entries removed.
train_text_columns = train['history_text'].dropna()

In [None]:
# Test texts: the history_text column with missing entries removed.
test_text_columns = test['history_text'].dropna()

In [None]:
# Learn the TF-IDF vocabulary and weights from the training text only, and
# encode that text as the training feature matrix.
X_train = vectorizer.fit_transform(train_text_columns)

In [None]:
# Training labels, restricted to the rows that have a history_text so they
# stay aligned with the rows that were vectorized.
has_train_text = train['history_text'].notna()
y_train = train.loc[has_train_text, 'creator_normalized']

In [None]:
# Encode the test text with the already-fitted vectorizer (transform only,
# no refit), so nothing is learned from the test split.
X_test = vectorizer.transform(test_text_columns)

In [None]:
# Test labels, restricted to the rows that have a history_text so they stay
# aligned with the rows that were vectorized.
has_test_text = test['history_text'].notna()
y_test = test.loc[has_test_text, 'creator_normalized']

In [None]:
# Vocabulary terms of the fitted vectorizer, in the column order of the TF-IDF matrices.
feature_names = vectorizer.get_feature_names_out()

# Train classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# In scikit-learn, C is the inverse regularization strength (larger means
# weaker regularization); max_iter caps the number of solver iterations.
classifier = LogisticRegression(C=5, max_iter=1000)

In [None]:
# Fit the logistic regression on the TF-IDF training features and the binary labels.
classifier.fit(X_train, y_train)

In [None]:
# Predict a label for every vectorized test row.
predictions = classifier.predict(X_test)

# Show metrics

In [None]:
from sklearn import metrics

# Accuracy: fraction of test rows whose predicted label equals the true label.
score = metrics.accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# Display the test-set accuracy held in `score`.
print(score)

In [None]:
# Precision for the positive class (label 1, which normalize() assigns to
# 'Marvel Comics'): of the rows predicted as 1, the share that truly are 1.
precision = metrics.precision_score(y_true=y_test, y_pred=predictions)

In [225]:
# Display the test-set precision held in `precision`.
print(precision)

0.9010989010989011
