In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string

# Read the labelled movie descriptions from disk.
# The data is expected in a CSV file named 'predicted_genres.csv'
# located in the current working directory.
csv_path = 'predicted_genres.csv'
df = pd.read_csv(csv_path)

# Show a short preview (the first rows) as a quick sanity check of the load.
preview = df.head()
print(preview)

# Text preprocessing helpers

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use so the NLTK stopword corpus is read once instead of
# once per row when the function is mapped over a DataFrame column.
_STOP_WORD_CACHE = {}


def _default_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Lowercase *text*, drop stop words and strip ASCII punctuation.

    Args:
        text: The raw description. Any non-string value (for example ``None``
            or the float NaN pandas uses for an empty CSV cell) is treated as
            an empty description.
        stop_words: Optional iterable of lowercase stop words to remove.
            Defaults to the NLTK English stop word list, which requires the
            'stopwords' corpus to have been downloaded before the first call.

    Returns:
        The surviving words joined by single spaces; ``''`` when nothing is
        left or when *text* is not a string.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = _default_stop_words()
    else:
        stop_words = frozenset(stop_words)

    kept = []
    for token in text.lower().split():
        # Compare against the stop list BEFORE deleting inner punctuation:
        # contraction stop words such as "don't" would otherwise turn into
        # "dont" and slip through. Only surrounding punctuation is trimmed
        # here so that "the," still matches "the".
        if token.strip(string.punctuation) in stop_words:
            continue
        word = token.translate(_PUNCT_TABLE)
        # Skip tokens that were pure punctuation, and tokens that only become
        # a stop word once inner punctuation is removed.
        if word and word not in stop_words:
            kept.append(word)
    return ' '.join(kept)

# Apply text preprocessing to the 'Description' column
nltk.download('stopwords')
df['Description'] = df['Description'].apply(preprocess_text)
y = df['Predicted_Genre']

# Split the cleaned text BEFORE vectorizing, so the held-out descriptions
# cannot influence the TF-IDF vocabulary or IDF weights (no train/test leakage).
texts_train, texts_test, y_train, y_test = train_test_split(
    df['Description'], y, test_size=0.2, random_state=42
)

# Vectorize the text using TF-IDF: fit on the training split only, then apply
# the fitted vocabulary to the test split. The matrices are kept sparse; all
# three estimators below accept sparse input, and a dense copy of an
# n_documents x 5000 matrix wastes memory.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(texts_train)
X_test = tfidf.transform(texts_test)

# Initialize the models
nb_model = MultinomialNB()
lr_model = LogisticRegression(max_iter=1000)
svm_model = SVC(kernel='linear')

# Display name -> estimator; insertion order fixes the order of the output.
models = {
    "Naive Bayes": nb_model,
    "Logistic Regression": lr_model,
    "SVM": svm_model,
}

# Train the models
for model in models.values():
    model.fit(X_train, y_train)

# Make predictions
predictions = {name: model.predict(X_test) for name, model in models.items()}
nb_predictions = predictions["Naive Bayes"]
lr_predictions = predictions["Logistic Regression"]
svm_predictions = predictions["SVM"]

# Evaluate the models: all accuracies first, then the per-class reports.
for name, predicted in predictions.items():
    print(f"{name} Accuracy:", accuracy_score(y_test, predicted))

for name, predicted in predictions.items():
    print(f"\n{name} Classification Report:\n", classification_report(y_test, predicted))


   Id                          Title  \
0   1          Edgar's Lunch (1998)    
1   2      La guerra de papá (1977)    
2   3   Off the Beaten Track (2010)    
3   4        Meu Amigo Hindu (2015)    
4   5             Er nu zhai (1955)    

                                         Description Predicted_Genre  
0   L.R. Brane loves his life - his car, his apar...          drama   
1   Spain, March 1964: Quico is a very naughty ch...          drama   
2   One year in the life of Albin and his family ...    documentary   
3   His father has died, he hasn't spoken with hi...    documentary   
4   Before he was known internationally as a mart...    documentary   


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DEV\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
