In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
import mlflow
import mlflow.sklearn


In [2]:
# Download the NLTK data used by preprocess_text: the stop-word corpus
# (read through stopwords.words) and WordNet (used by WordNetLemmatizer).
# As the recorded output shows, packages that are already present are
# reported as up-to-date.
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\keert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\keert\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Lazily-filled cache: the stop-word set and the lemmatizer are built on the
# first call and reused afterwards, instead of once per row when this function
# is applied to a whole DataFrame column.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one raw text value into a space-joined string of lemmas.

    Steps, in order: convert to ``str`` and lower-case, delete every character
    that is neither an ASCII letter nor whitespace, split on whitespace, drop
    English stop words, lemmatize each remaining token, and re-join with
    single spaces.

    Parameters
    ----------
    text : object
        A single scalar value (typically a string). Null-like values
        (``None``, ``NaN``, ...) are accepted.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is null.
    """
    # Null check first so missing values never touch the NLTK resources.
    if pd.isnull(text):
        return ''

    stop_words, lemmatizer = _get_text_resources()

    lowered = str(text).lower()
    # Characters are deleted (not replaced by a space), so tokens separated
    # only by punctuation are merged, e.g. "good.bad" -> "goodbad".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    kept = [word for word in letters_only.split() if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in kept)

In [4]:
def preprocess_and_split_data(df, text_column, target_column, test_size=0.2, random_state=42, threshold=3):
    """Clean the review text, binarise the rating, and split into train/test.

    The input frame is left untouched: the cleaned text and the label are
    built as stand-alone Series, so re-running the calling cell does not
    accumulate columns on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least ``text_column`` and ``target_column``.
    text_column : str
        Column with the raw text; each value is passed to ``preprocess_text``.
    target_column : str
        Column with the rating compared against ``threshold``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.
    threshold : default 3
        Ratings ``>= threshold`` are labelled 1, everything else 0.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``; the text Series is named ``'Cleaned Text'`` and
        the label Series ``'Sentiment'``.
    """
    cleaned_text = df[text_column].apply(preprocess_text).rename('Cleaned Text')
    # NOTE(review): a missing rating compares False and is therefore labelled
    # 0 -- confirm that is intended rather than dropping such rows.
    sentiment = (df[target_column] >= threshold).astype('int64').rename('Sentiment')
    return train_test_split(cleaned_text, sentiment, test_size=test_size, random_state=random_state)

In [5]:
def train_model(X_train, y_train, pipeline, param_grid, scoring='f1_weighted', cv=5):
    """Tune ``pipeline`` with a cross-validated grid search inside an MLflow run.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    pipeline
        Estimator (here a scikit-learn ``Pipeline``) to tune.
    param_grid : dict
        Hyperparameter grid forwarded to ``GridSearchCV``.
    scoring : str, default 'f1_weighted'
        Scoring forwarded to ``GridSearchCV``.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object.

    Side effects
    ------------
    Starts one MLflow run and logs the best hyperparameters, the best
    cross-validation score and the fit duration to it; prints the wall time.
    """
    import time  # local import: the block stays self-contained

    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        return_train_score=True,
        verbose=1,
    )
    with mlflow.start_run():
        # Plain-Python timing instead of the IPython-only `%time` magic, so
        # the function also works when the code is run as a script.
        started = time.perf_counter()
        grid_search.fit(X_train, y_train)
        elapsed = time.perf_counter() - started
        print(f"Wall time: {elapsed:.2f} s")

        # Record the outcome so the run is not left empty.
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric('best_cv_score', grid_search.best_score_)
        mlflow.log_metric('fit_seconds', elapsed)
    return grid_search

In [6]:
def evaluate_model(model, X_test, y_test, X_train=None, y_train=None):
    """Score a fitted model on the test split and, optionally, the train split.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test, y_test
        Hold-out features and labels.
    X_train, y_train : optional
        Training features and labels. When both are supplied the training
        accuracy is reported as well; they must be given together.

    Returns
    -------
    dict
        ``'F1 Score'`` (weighted F1 on the test split) and ``'Accuracy'``
        (test accuracy), plus ``'Train Accuracy'`` when training data was
        supplied.

    Raises
    ------
    ValueError
        If only one of ``X_train`` / ``y_train`` is provided.
    """
    if (X_train is None) != (y_train is None):
        raise ValueError("X_train and y_train must be provided together")

    y_test_pred = model.predict(X_test)
    metrics = {
        'F1 Score': f1_score(y_test, y_test_pred, average='weighted'),
        'Accuracy': accuracy_score(y_test, y_test_pred),
    }
    # The train metric is optional so callers that only hold the test split
    # can still evaluate a model.
    if X_train is not None:
        metrics['Train Accuracy'] = accuracy_score(y_train, model.predict(X_train))
    return metrics

In [7]:
def train_and_evaluate_models(X_train, y_train, X_test, y_test, pipelines, param_grids):
    """Run ``train_model`` once per pipeline and collect the fitted searches.

    Parameters
    ----------
    X_train, y_train
        Training data forwarded to ``train_model``.
    X_test, y_test
        Accepted but not used by this function.
    pipelines : dict
        Maps a model name to the pipeline to tune.
    param_grids : dict
        Maps the same model names to their hyperparameter grids.

    Returns
    -------
    dict
        ``{model_name: {'grid_search': <value returned by train_model>}}``,
        in the iteration order of ``pipelines``.
    """
    return {
        name: {'grid_search': train_model(X_train, y_train, candidate, param_grids[name])}
        for name, candidate in pipelines.items()
    }

In [8]:
def print_results(results, X_test, y_test):
    """Print the best hyperparameters and evaluation metrics of each model.

    Parameters
    ----------
    results : dict
        Maps a model name to a dict whose ``'grid_search'`` entry exposes
        ``best_params_`` and ``best_estimator_``.
    X_test, y_test
        Hold-out data; ``evaluate_model`` is called with the best estimator
        and this test split only.

    Returns
    -------
    None
        Output goes to stdout: one block per model followed by a blank line.
    """
    for name, entry in results.items():
        print(f"Metrics for {name}:")
        search = entry['grid_search']
        print("Best hyperparameters:", search.best_params_)
        scores = evaluate_model(search.best_estimator_, X_test, y_test)
        for label in scores:
            print(f"{label}: {scores[label]}")
        print()

In [9]:
def main(file_path=None, text_column='Review text', target_column='Ratings', random_state=42):
    """Load the reviews CSV, tune three classifiers and print their metrics.

    Parameters
    ----------
    file_path : str or path-like, optional
        CSV file to read. Defaults to the original author's local path, so
        pass your own location when running on another machine.
    text_column : str, default 'Review text'
        Column holding the raw review text.
    target_column : str, default 'Ratings'
        Column holding the rating that is binarised into the sentiment label.
    random_state : int, default 42
        Seed used for the train/test split and for the random forest.

    Returns
    -------
    None
        Results are printed by ``print_results``.
    """
    if file_path is None:
        file_path = r"C:\Users\hridy\Documents\Innomatics\Sentiment Analysis\reviews_data_dump\reviews_badminton\data.csv"

    df = pd.read_csv(file_path)

    X_train, X_test, y_train, y_test = preprocess_and_split_data(
        df, text_column, target_column, random_state=random_state
    )

    models = {
        # Seeded so repeated runs of the notebook give the same forest.
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'Logistic Regression': LogisticRegression(),
        'Support Vector Machine': SVC()
    }

    # Every model shares the same front end: TF-IDF features followed by
    # variance scaling. with_mean=False because the TF-IDF matrix is sparse
    # and StandardScaler cannot centre sparse input.
    pipelines = {
        model_name: Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('scaler', StandardScaler(with_mean=False)),
            ('model', model)
        ])
        for model_name, model in models.items()
    }

    # Keys must match `pipelines`; the `step__param` names address the
    # pipeline steps defined above.
    param_grids = {
        'Random Forest': {
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'model__n_estimators': [100, 200, 300],
            'model__max_depth': [None, 10, 20]
        },
        'Logistic Regression': {
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'model__C': [0.1, 1.0, 10.0],
            'model__solver': ['liblinear', 'lbfgs']
        },
        'Support Vector Machine': {
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'model__C': [0.1, 1.0, 10.0],
            'model__kernel': ['linear', 'rbf']
        }
    }

    results = train_and_evaluate_models(X_train, y_train, X_test, y_test, pipelines, param_grids)

    print_results(results, X_test, y_test)

In [None]:
# Entry point: run the full load / train / evaluate pipeline defined in main().
if __name__ == "__main__":
    main()