# NLP Model

**Imports**

In [1]:
import pandas as pd

**Load the dataset**

In [2]:
df = pd.read_csv("DATA/poem-classification.csv")

**Split data into X feature columns and y label column**

In [3]:
X = df["content"]
y = df[["age", "type"]]

**Remove special characters from the text**

In [4]:
import re
import string

def remove_special_chars(text):
    text = re.sub(r'[^\text00-\x7f]',r' ',text)
    text = re.sub("["+string.punctuation+"]", " ", text)
    text = text.replace('  ','')
    text = text.lower()
  
    return text

In [5]:
X = X.apply(remove_special_chars)

**Remove stopwords**

In [6]:
stops = {"ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once", "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until", "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more", "himself", "this", "down", "should", "our", "their", "while", "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"} 
def remove_stopwords(text):
    text = [word for word in text.split() if word not in stops]
    return " ".join(text)

In [7]:
X = X.apply(remove_stopwords)

**Train | Test Split**

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

**Vectorization**

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

**Estimator**

In [10]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

estimator = MultiOutputClassifier(LogisticRegression(C=10))

**Pipeline**

In [11]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('tfidf', tfidf),
    ('estimator', estimator)
    ])

**Train the model**

In [12]:
pipe.fit(X_train, y_train)

**Evaluate the model**

In [13]:
y_pred = pipe.predict(X_test)

In [14]:
from sklearn.metrics import classification_report
print(classification_report(y_test["age"], y_pred[:,0]))

              precision    recall  f1-score   support

      Modern       0.97      0.90      0.94        72
 Renaissance       0.93      0.98      0.96       100

    accuracy                           0.95       172
   macro avg       0.95      0.94      0.95       172
weighted avg       0.95      0.95      0.95       172



In [15]:
from sklearn.metrics import classification_report
print(classification_report(y_test["type"], y_pred[:,1]))

                      precision    recall  f1-score   support

                Love       0.69      0.88      0.77        99
Mythology & Folklore       0.33      0.10      0.15        21
              Nature       0.57      0.44      0.50        52

            accuracy                           0.65       172
           macro avg       0.53      0.47      0.47       172
        weighted avg       0.61      0.65      0.61       172



**Save the model**

In [16]:
pipe.fit(X, y)
y_pred = pipe.predict(X_test)

In [17]:
from joblib import dump
dump(pipe, '../model/model.joblib') 

['../model/model.joblib']