Gathering and Adjusting of Data

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,classification_report
import re
from nltk.corpus import stopwords
import joblib

In [31]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parda\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [23]:
movie_train = pd.read_csv('train_data.txt', delimiter=':::',names=['id','name','genre','description'])
movie_test = pd.read_csv('test_data.txt', delimiter=':::',names=['id','name','description'])
movie_test_solution = pd.read_csv('test_data_solution.txt', delimiter=':::',names=['id','name','genre','description'])

  movie_train = pd.read_csv('train_data.txt', delimiter=':::',names=['id','name','genre','description'])
  movie_test = pd.read_csv('test_data.txt', delimiter=':::',names=['id','name','description'])
  movie_test_solution = pd.read_csv('test_data_solution.txt', delimiter=':::',names=['id','name','genre','description'])


In [24]:
print(movie_train.head())


   id                                name       genre  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         description  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  


In [25]:
print(movie_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           54214 non-null  int64 
 1   name         54214 non-null  object
 2   genre        54214 non-null  object
 3   description  54214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB
None


In [26]:
print(movie_train['genre'].value_counts())

genre
 drama           13613
 documentary     13096
 comedy           7447
 short            5073
 horror           2204
 thriller         1591
 action           1315
 western          1032
 reality-tv        884
 family            784
 adventure         775
 music             731
 romance           672
 sci-fi            647
 adult             590
 crime             505
 animation         498
 sport             432
 talk-show         391
 fantasy           323
 mystery           319
 musical           277
 biography         265
 history           243
 game-show         194
 news              181
 war               132
Name: count, dtype: int64


In [27]:
print(movie_train.describe())

                 id
count  54214.000000
mean   27107.500000
std    15650.378084
min        1.000000
25%    13554.250000
50%    27107.500000
75%    40660.750000
max    54214.000000


Encoding Data

In [28]:
label_encoder=LabelEncoder()
movie_train['genre']=label_encoder.fit_transform(movie_train['genre'])
movie_test_solution['genre']=label_encoder.transform(movie_test_solution['genre'])

In [32]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text



In [33]:
movie_train['clean_description'] = movie_train['description'].apply(preprocess_text)
movie_test['clean_description'] = movie_test['description'].apply(preprocess_text)
movie_test_solution['clean_description'] = movie_test_solution['description'].apply(preprocess_text)

In [35]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
x_train = tfidf_vectorizer.fit_transform(movie_train['clean_description'])
x_test = tfidf_vectorizer.transform(movie_test['clean_description'])
y_train = movie_train['genre']
y_test = movie_test_solution['genre']

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

models = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=100)
}

best_model = None
best_accuracy = 0

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy}")
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

print(f"Best Model: {best_model}")


MultinomialNB Accuracy: 0.5231918819188192
LogisticRegression Accuracy: 0.5842066420664207
RandomForest Accuracy: 0.49437269372693726
Best Model: LogisticRegression(max_iter=1000)


In [38]:
y_pred=best_model.predict(x_test)
print(classification_report(y_test,y_pred,target_names=label_encoder.classes_))

               precision    recall  f1-score   support

      action        0.47      0.28      0.35      1314
       adult        0.59      0.23      0.33       590
   adventure        0.57      0.16      0.25       775
   animation        0.51      0.06      0.11       498
   biography        0.00      0.00      0.00       264
      comedy        0.53      0.59      0.55      7446
       crime        0.34      0.03      0.06       505
 documentary        0.67      0.85      0.75     13096
       drama        0.55      0.77      0.64     13612
      family        0.50      0.08      0.14       783
     fantasy        0.55      0.05      0.10       322
   game-show        0.90      0.50      0.64       193
     history        0.00      0.00      0.00       243
      horror        0.64      0.57      0.60      2204
       music        0.65      0.44      0.53       731
     musical        0.18      0.01      0.03       276
     mystery        0.38      0.01      0.02       318
        n

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
joblib.dump(best_model, 'best_genre_classification_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [41]:
best_model = joblib.load('best_genre_classification_model.pkl')
label_encoder = joblib.load('label_encoder.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')


In [44]:
movie_test['cleaned_description'] = movie_test['description'].apply(preprocess_text)
X_new = tfidf_vectorizer.transform(movie_test['cleaned_description']).toarray()
predicted_genres_encoded = best_model.predict(X_new)
predicted_genres = label_encoder.inverse_transform(predicted_genres_encoded)

In [47]:
output_file = 'predictions.txt'

with open(output_file, 'w') as file:
    file.write('ID\tTITLE\tPREDICTED_GENRE\n')
    for id, title, genre in zip(movie_test['id'], movie_test['name'], predicted_genres):
        file.write(f'{id}\t{title}\t{genre}\n')

print(f'Predictions saved to {output_file}')

Predictions saved to predictions.txt
