In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
sns.set_theme()
import mlflow
import joblib

from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from collections import Counter
from nltk.corpus import stopwords

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the training and test splits from CSV files in the working directory
# (train first, then test).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Drop the 'url' column from BOTH splits so train and test keep the same
# schema; only headlines/description/content are used as features below.
# errors='ignore' keeps the cell re-runnable (the column is already gone on a
# second execution) and tolerates a split that never had the column.
train = train.drop(columns='url', errors='ignore')
test = test.drop(columns='url', errors='ignore')
train.head()

Unnamed: 0,headlines,description,content,category
0,RBI revises definition of politically-exposed ...,The central bank has also asked chairpersons a...,The Reserve Bank of India (RBI) has changed th...,business
1,NDTV Q2 net profit falls 57.4% to Rs 5.55 cror...,NDTV's consolidated revenue from operations wa...,Broadcaster New Delhi Television Ltd on Monday...,business
2,"Akasa Air ‘well capitalised’, can grow much fa...",The initial share sale will be open for public...,Homegrown server maker Netweb Technologies Ind...,business
3,India’s current account deficit declines sharp...,The current account deficit (CAD) was 3.8 per ...,India’s current account deficit declined sharp...,business
4,"States borrowing cost soars to 7.68%, highest ...",The prices shot up reflecting the overall high...,States have been forced to pay through their n...,business


In [4]:
# Separate the 'category' label (y) from the remaining feature columns (X)
# for each split. drop() returns a new frame, so train/test stay untouched.
X_train, y_train = train.drop(columns='category'), train['category']
X_test, y_test = test.drop(columns='category'), test['category']

In [5]:
# Both feature frames, grouped so the cleaning loop below can iterate over them.
# The list holds references, so in-place column assignments made while looping
# over `datasets` are visible through X_train / X_test as well.
datasets = [X_train, X_test]

In [6]:
def remove_punctuation_and_special_characters(text):
  """Replaces punctuation and special characters in a value with spaces.

  Each character that is neither a word character (letters, digits,
  underscore) nor whitespace is substituted by a single space, so the
  length of a string input is preserved.

  Args:
    text: The input value. Normally a string; missing values (None, NaN,
      pandas NA) and other scalars such as numbers are tolerated.

  Returns:
    The cleaned string. Missing values yield an empty string; other
    non-string values are converted with str() before cleaning.
  """
  if not isinstance(text, str):
    # Empty CSV cells arrive as NaN (a float); re.sub would raise TypeError.
    if text is None or pd.isna(text):
      return ''
    text = str(text)
  pattern = r'[^\w\s]'  # Matches any character that is NOT a word character (\w) or whitespace (\s)
  return re.sub(pattern, ' ', text)

In [7]:
def clean_dataframe_column_string(df, column_name):
    """Clean one column of a DataFrame in place.

    Runs ``remove_punctuation_and_special_characters`` over every value of
    the selected column and stores the result back under the same label.

    Args:
        df: DataFrame that contains the column; it is modified in place.
        column_name: Label of the column to clean.

    Returns:
        The same ``df`` object that was passed in.
    """
    cleaned_values = df[column_name].apply(remove_punctuation_and_special_characters)
    df[column_name] = cleaned_values
    return df

In [8]:
# Clean every column of both feature frames in place.
# Missing values are filled and the column is cast to str first, so that a
# NaN from an empty CSV cell reaches the regex helper as an empty string
# instead of a float (which re.sub cannot process).
for df in datasets:
    for column in df.columns:
        df[column] = df[column].fillna('').astype(str)
        clean_dataframe_column_string(df, column)

In [9]:
# Make sure the three text columns hold plain strings with no missing values.
# Order matters: fill missing values BEFORE casting, because astype(str) can
# turn NaN into the literal text 'nan', after which fillna('') has nothing
# left to replace and 'nan' could end up as a token in the vocabulary.
for frame in (X_train, X_test):
    for text_column in ('headlines', 'description', 'content'):
        frame[text_column] = frame[text_column].fillna('').astype(str)

In [11]:
# Combine headlines, description and content into one text field per article
X_train_combined_text = X_train['headlines'] + " " + X_train['description'] + " " + X_train['content']
X_test_combined_text = X_test['headlines'] + " " + X_test['description'] + " " + X_test['content']

# Hyperparameters live in dicts so the exact values used to build the
# vectorizer and the model can also be recorded on the MLflow run.
vectorizer_params = {'stop_words': 'english', 'min_df': 2, 'max_df': 0.5}
model_params = {'C': 10, 'max_iter': 1000, 'solver': 'liblinear'}

with mlflow.start_run():
    # Record the configuration next to the artifacts and metrics so the run
    # can be reproduced from its tracking record alone.
    mlflow.log_params({f"vectorizer_{name}": value for name, value in vectorizer_params.items()})
    mlflow.log_params({f"model_{name}": value for name, value in model_params.items()})

    # Vectorize the combined text; the vocabulary is fitted on the training
    # split only and then reused unchanged for the test split.
    vectorizer_combined = CountVectorizer(**vectorizer_params)
    X_train_combined_vectorized = vectorizer_combined.fit_transform(X_train_combined_text)
    X_test_combined_vectorized = vectorizer_combined.transform(X_test_combined_text)

    # Persist the fitted vectorizer locally and attach it to the run
    vectorizer_path = "vectorizer.pkl"
    joblib.dump(vectorizer_combined, vectorizer_path)
    mlflow.log_artifact(vectorizer_path, "vectorizers")

    # NOTE(review): liblinear handles the multi-class target one-vs-rest; newer
    # scikit-learn releases may warn about this combination - confirm against
    # the installed version before changing the solver.
    best_model = LogisticRegression(**model_params)
    best_model.fit(X_train_combined_vectorized, y_train)
    y_pred = best_model.predict(X_test_combined_vectorized)

    # Log the logistic regression model
    mlflow.sklearn.log_model(best_model, "logistic_regression_model")

    # Also keep a plain joblib pickle of the model as a run artifact
    model_path = "Logistic_Regression.pkl"
    joblib.dump(best_model, model_path)
    mlflow.log_artifact(model_path, "log_model")

    # Log metrics from the classification report. Dict-valued entries are
    # flattened to "<entry>_<metric>"; scalar entries (accuracy) are logged
    # under their own key.
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    for key, value in classification_rep.items():
        if isinstance(value, dict):  # Log precision, recall, F1-score, etc.
            for subkey, subvalue in value.items():
                mlflow.log_metric(f"{key}_{subkey}", subvalue)
        else:  # Log accuracy
            mlflow.log_metric(key, value)

    # Print the classification report
    print("Logistic Regression Classification Report:")
    print(classification_report(y_test, y_pred))



Logistic Regression Classification Report:
               precision    recall  f1-score   support

     business       0.98      0.96      0.97       400
    education       0.99      0.98      0.99       400
entertainment       0.99      0.99      0.99       400
       sports       0.99      0.99      0.99       400
   technology       0.95      0.97      0.96       400

     accuracy                           0.98      2000
    macro avg       0.98      0.98      0.98      2000
 weighted avg       0.98      0.98      0.98      2000

