# News Data Analysis and Classification

### Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


### Load data from the CSV file into a pandas DataFrame

In [2]:
data = pd.read_csv('News_Data_Set.csv')

data.dropna(subset=['News', 'Section'], inplace=True)

### DataFrame has columns named 'News' containing independent variable and 'Section' containing labels

In [3]:
X = data['News'].astype(str)  
y = data['Section']  


### Preprocess the text data using TF-IDF vectorization

In [4]:

vectorizer = TfidfVectorizer()  
X = vectorizer.fit_transform(X)

### Split the dataset into training and testing sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

### Create Random Forest Classifier model

In [6]:
rf_model = RandomForestClassifier()  
rf_model.fit(X_train, y_train)


### Define the parameter grid for GridSearchCV For Hyperparameter tuning

In [7]:
param_grid = {
    'n_estimators': range(1,200)
}

### Perform GridSearchCV to find the best hyperparameters

In [8]:
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)



### Get the best parameters and their corresponding score

In [9]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Accuracy score: {best_score:.4f}")

Best parameters: {'n_estimators': 164}
Corresponding accuracy score: 0.5500


### Train the final model using the best parameters on the entire training set

In [12]:
best_rf_model = RandomForestClassifier(n_estimators=best_params['n_estimators'])  
best_rf_model.fit(X_train, y_train)

best_predictions = best_rf_model.predict(X_test)

### Evaluate the performance of the best model on the test set

In [14]:
print(f"Best Model Accuracy: {best_score:.4f}")
print("Best Model Classification Report:")
print(classification_report(y_test, best_predictions))

Best Model Accuracy: 0.5500
Best Model Classification Report:
              precision    recall  f1-score   support

 Agriculture       0.00      0.00      0.00         1
       Books       0.00      0.00      0.00         3
       Caste       0.00      0.00      0.00         1
     Culture       0.00      0.00      0.00         1
   Diplomacy       1.00      0.50      0.67         2
     Economy       0.00      0.00      0.00         0
 Environment       0.00      0.00      0.00         1
  Government       0.00      0.00      0.00         2
         Law       0.38      1.00      0.55         3
       Media       0.00      0.00      0.00         1
    Politics       0.00      0.00      0.00         0
    Security       0.33      0.50      0.40         2
  South Asia       1.00      1.00      1.00         1
    The Arts       0.00      0.00      0.00         1
       Video       0.00      0.00      0.00         1
       World       1.00      1.00      1.00         1

    accuracy      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
