In [1]:
import pandas as pd

In [2]:
# Importing the dependencies
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\himanshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\himanshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\himanshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\himanshu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
df_train = pd.read_csv('Poem_classification - train_data.csv')
df_test = pd.read_csv('Poem_classification - test_data.csv')

In [4]:
df_train.head()

Unnamed: 0,Genre,Poem
0,Music,
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...


In [5]:
df_test.head()

Unnamed: 0,Genre,Poem
0,Music,A woman walks by the bench I’m sitting onwith ...
1,Music,"Because I am a boy, the untouchability of beau..."
2,Music,"Because today we did not leave this world,We n..."
3,Music,"Big Bend has been here, been here. Shouldn’t i..."
4,Music,"I put shells there, along the lip of the road...."


In [6]:
df_train.shape

(841, 2)

In [7]:
df_test.shape

(150, 2)

In [8]:
df_train.isnull().sum()

Genre    0
Poem     4
dtype: int64

In [9]:
df_train.dropna(inplace=True)

In [10]:
df_test.isnull().sum()

Genre    0
Poem     0
dtype: int64

In [11]:
df_train['Genre'].value_counts()

Music          238
Death          231
Environment    227
Affection      141
Name: Genre, dtype: int64

In [12]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 837 entries, 1 to 840
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Genre   837 non-null    object
 1   Poem    837 non-null    object
dtypes: object(2)
memory usage: 19.6+ KB


In [13]:
class_mapping = {'Music': 1, 'Death': 2, 'Environment': 3, 'Affection': 4}
df_train['Genre'] = df_train['Genre'].replace(class_mapping)
df_train['Genre'] = df_train['Genre'].astype('int')

In [14]:
df_train

Unnamed: 0,Genre,Poem
1,1,In the thick brushthey spend the...
2,1,Storms are generous. ...
3,1,—After Ana Mendieta Did you carry around the ...
4,1,for Aja Sherrard at 20The portent may itself ...
5,1,"for Bob Marley, Bavaria, November 1980 Here i..."
...,...,...
836,3,Why make so much of fragmentary blue In here a...
837,3,"Woman, I wish I didn't know your name. What co..."
838,3,"Yonder to the kiosk, beside the creek, Paddle ..."
839,3,You come to fetch me from my work to-night Whe...


In [15]:
df_test['Genre'] = df_train['Genre'].replace(class_mapping)
df_test['Genre'] = df_train['Genre'].astype('int')

In [16]:
df_test.isna().sum()


Genre    1
Poem     0
dtype: int64

In [17]:
df_test.dropna(inplace=True)

In [18]:
df_train.isna().sum()


Genre    0
Poem     0
dtype: int64

In [19]:
df_test.isna().sum()


Genre    0
Poem     0
dtype: int64

In [20]:
df_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 837 entries, 1 to 840
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Genre   837 non-null    int32 
 1   Poem    837 non-null    object
dtypes: int32(1), object(1)
memory usage: 48.6+ KB


# Text_Processing(Vectorization, StopWords, Lemmitization)

In [21]:
# Define the text preprocessing function
def text_preprocessing(df, text_columns):
    # Define the regular expression to remove unwanted characters and patterns
    clean_regex = r"[^a-zA-Z\s]+"

    # Preprocess the text data
    for column in text_columns:
        df[column] = df[column].str.lower().replace(clean_regex, '', regex=True)
        df[column] = df[column].apply(nltk.word_tokenize)
        stop_words = stopwords.words('english')
        df[column] = df[column].apply(lambda x: [word for word in x if word not in stop_words])
        lemmatizer = WordNetLemmatizer()
        df[column] = df[column].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [22]:
# Split the data into training and test sets
#df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Perform text preprocessing on the training and test sets
text_columns = ['Poem']
text_preprocessing(df_train, text_columns)
text_preprocessing(df_test, text_columns)

# Define the feature matrix and target vector for the training set
X_train = df_train['Poem'].apply(lambda x: ' '.join(x))
y_train = df_train['Genre']

# Define the feature matrix and target vector for the test set
X_test = df_test['Poem'].apply(lambda x: ' '.join(x))
y_test = df_test['Genre']

In [23]:
X_train

1      thick brushthey spend hottest part day soaking...
2      storm generous something easy surrender sittin...
3      ana mendieta carry around matin star hold fore...
4      aja sherrard portent may memory wallace steven...
5      bob marley bavaria november brilliant morning ...
                             ...                        
836    make much fragmentary blue bird butterfly flow...
837    woman wish didnt know name could silence house...
838    yonder kiosk beside creek paddle swift caque t...
839    come fetch work tonight supper table well see ...
840    see water glass liquid air plenty liquid water...
Name: Poem, Length: 837, dtype: object

# Define the pipelines and parameters for grid search

In [24]:
# Define the pipelines and parameters for grid search
pipelines = [
    ('nb', Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])),
    ('lr', Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression())])),
    ('dt', Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', DecisionTreeClassifier())])),
    ('rf', Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier())]))
]

parameters = [
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': [True, False],
        'clf__alpha': [1, 0.1, 0.01, 0.001]
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': [True, False],
        'clf__C': [1, 10, 100]
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': [True, False],
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': [None, 5, 10, 20],
        'clf__min_samples_split': [2, 5, 10]
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': [True, False],
        'clf__n_estimators': [10, 50, 100],
        'clf__max_depth': [None, 5, 10, 20],
        'clf__min_samples_split': [2, 5, 10]
    }
]

# Define the grid search


In [25]:
# Define the grid search
best_score = 0
for pipeline, params in zip(pipelines, parameters):
    text_clf = GridSearchCV(pipeline[1], params, cv=5, scoring='accuracy')
    text_clf.fit(X_train, y_train)
    y_pred = text_clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    if score > best_score:
        best_score = score
        best_model = text_clf
    print(f"Model: {pipeline[0]}")
    print(f"Best parameters: {text_clf.best_params_}")
    print(f"Best score: {text_clf.best_score_}")
    print(f"Classification report: \n{classification_report(y_test, y_pred)}\n")

print(f"The best model is {best_model.estimator.named_steps['clf'].__class__.__name__} with an accuracy score of {best_score}.")


Model: nb
Best parameters: {'clf__alpha': 1, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Best score: 0.42658967778728263
Classification report: 
              precision    recall  f1-score   support

         1.0       1.00      0.37      0.54       149
         2.0       0.00      0.00      0.00         0
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         0

    accuracy                           0.37       149
   macro avg       0.25      0.09      0.13       149
weighted avg       1.00      0.37      0.54       149




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_

Model: lr
Best parameters: {'clf__C': 10, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Best score: 0.4313587111491303
Classification report: 
              precision    recall  f1-score   support

         1.0       1.00      0.31      0.47       149
         2.0       0.00      0.00      0.00         0
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         0

    accuracy                           0.31       149
   macro avg       0.25      0.08      0.12       149
weighted avg       1.00      0.31      0.47       149


Model: dt
Best parameters: {'clf__criterion': 'gini', 'clf__max_depth': 20, 'clf__min_samples_split': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Best score: 0.39912318220701454
Classification report: 
              precision    recall  f1-score   support

         1.0       1.00      0.54      0.70       149
         2.0       0.00      0.00      0.00         0
         3.0       0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: rf
Best parameters: {'clf__max_depth': None, 'clf__min_samples_split': 5, 'clf__n_estimators': 100, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Best score: 0.43846592529227263
Classification report: 
              precision    recall  f1-score   support

         1.0       1.00      0.31      0.47       149
         2.0       0.00      0.00      0.00         0
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         0

    accuracy                           0.31       149
   macro avg       0.25      0.08      0.12       149
weighted avg       1.00      0.31      0.47       149


The best model is DecisionTreeClassifier with an accuracy score of 0.5369127516778524.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
