<a href="https://colab.research.google.com/github/hermimimeow/hermionedeng/blob/main/chi_square_Feature_Selection_logistic_regression_%26_random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reading the Data

In [None]:
!pip install "tf-models-official==2.13.*"

Collecting tf-models-official==2.13.*
  Downloading tf_models_official-2.13.2-py2.py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting immutabledict (from tf-models-official==2.13.*)
  Downloading immutabledict-4.1.0-py3-none-any.whl (4.5 kB)
Collecting sacrebleu (from tf-models-official==2.13.*)
  Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from tf-models-official==2.13.*)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval (from tf-models-official==2.13.*)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import shutil
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')
import pandas as pd


In [None]:
import pandas as pd
import requests
from io import StringIO

# Replace this with your BOS URL
url = 'https://tufts.box.com/shared/static/423pwoe2cbf5hrw6wsfdo4pn83cynb2v.csv'

# If authentication is required, add the appropriate headers or tokens
response = requests.get(url)
response.raise_for_status()  # This will raise an error if the download failed

# Convert to a pandas DataFrame
data = StringIO(response.text)
df = pd.read_csv(data)





The dataset has three sentiments namely, negative(-1), neutral(0), and positive(+1). It contains two fields for the tweet and label.

In [None]:
df

Unnamed: 0,clean_text,category
0,when modi promised âminimum government maxim...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [None]:
df.columns

Index(['clean_text', 'category'], dtype='object')

# Data Preprocessing

In [None]:
#missing values - checking

# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)


clean_text    4
category      7
dtype: int64


In [None]:
#handling missing values
df = df.dropna()


In [None]:
# Convert 'category' to integer if it's categorical
df['category'] = df['category'].astype(int)


In [None]:
#text preprocessing

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download necessary NLTK data
nltk.download('stopwords')

# Define a function for text preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z]', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Stemming
    stemmer = PorterStemmer()
    text = ' '.join(stemmer.stem(word) for word in text.split())
    return text

# Apply the preprocessing function to the 'clean_text' column
df['clean_text'] = df['clean_text'].apply(lambda x: preprocess_text(x) if isinstance(x, str) else x)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Feature Selection - Using Chi-Square Feature selection

To implement Chi-square feature selection, we will typically use it in scenarios where you have categorical input features and a categorical target variable. Since we have text data in the 'clean_text' column and a numeric (possibly categorical) 'category' column, we'll first need to convert the text data into a numerical format that can be used for Chi-square testing.

In [None]:
#1. Vectorize the Text Data
#We'll use TF-IDF Vectorization as an example. It's a common technique to convert text to a numeric form.

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=1000)  # you can choose the number of features

# Apply the vectorizer to the 'clean_text' column
X_tfidf = tfidf.fit_transform(df['clean_text'].fillna(''))


In [None]:
#2. Apply Chi-Square Test
#Now, we'll apply the Chi-square test to select the features that have the strongest relationship with the target variable.


from sklearn.feature_selection import chi2
import pandas as pd

# Apply the Chi-square test
chi_scores = chi2(X_tfidf, df['category'])

# Create a DataFrame with feature names and their corresponding Chi-square scores
chi_scores_df = pd.DataFrame({'Feature': tfidf.get_feature_names_out(), 'Chi2Score': chi_scores[0], 'P-value': chi_scores[1]})


In [None]:
chi_scores_df

Unnamed: 0,Feature,Chi2Score,P-value
0,aap,4.399122,1.108518e-01
1,abhinandan,5.428419,6.625732e-02
2,abl,146.799011,1.327394e-32
3,absolut,142.127859,1.371916e-31
4,abt,12.423404,2.005820e-03
...,...,...,...
995,yogi,1.644970,4.393385e-01
996,young,164.877764,1.574883e-36
997,your,5.240012,7.280241e-02
998,youth,1.812425,4.040516e-01


In [None]:
chi_scores_df.sort_values(by='Chi2Score', ascending=False, inplace=True)
chi_scores_df

Unnamed: 0,Feature,Chi2Score,P-value
389,hate,1552.263514,0.000000e+00
671,poor,1346.619529,3.848439e-293
364,good,1340.364591,8.780661e-292
972,win,1332.540081,4.391350e-290
370,great,1215.110128,1.387357e-264
...,...,...,...
102,bjp,0.039125,9.806278e-01
32,along,0.033902,9.831917e-01
378,gut,0.029607,9.853055e-01
659,pl,0.014818,9.926186e-01


In [None]:
#Select top features

# Selecting features with the highest Chi-square scores
# we can choose a threshold or a number of top features
top_features = chi_scores_df.sort_values(by='Chi2Score', ascending=False).head(100)  # top 100 features

# we might also consider a p-value threshold
# For instance, selecting features with p-value less than 0.05
significant_features = chi_scores_df[chi_scores_df['P-value'] < 0.05]


In [None]:
#normalization and standardization
from sklearn.preprocessing import StandardScaler

# Standardize the numeric column (if needed)
scaler = StandardScaler()
df['category'] = scaler.fit_transform(df[['category']])


In [None]:
df

Unnamed: 0,clean_text,category
0,modi promis minimum govern maximum govern expe...,-1.568511
1,talk nonsens continu drama vote modi,-0.288555
2,say vote modi welcom bjp told rahul main campa...,0.991400
3,ask support prefix chowkidar name modi great s...,0.991400
4,answer among power world leader today trump pu...,0.991400
...,...,...
162975,crore paid neerav modi recov congress leader h...,-1.568511
162976,dear rss terrorist payal gawar modi kill plu m...,-1.568511
162977,cover interact forum left,-0.288555
162978,big project came india modi dream project happ...,-0.288555


#Logistic Regression Model

We've already transformed your text data into a numerical format and selected the top features using Chi-square feature selection. We'll use these features for training the SVM model.

In [None]:
# Convert 'category' to an integer type
df['category'] = df['category'].astype('int')
df

Unnamed: 0,clean_text,category
0,modi promis minimum govern maximum govern expe...,-1
1,talk nonsens continu drama vote modi,0
2,say vote modi welcom bjp told rahul main campa...,0
3,ask support prefix chowkidar name modi great s...,0
4,answer among power world leader today trump pu...,0
...,...,...
162975,crore paid neerav modi recov congress leader h...,-1
162976,dear rss terrorist payal gawar modi kill plu m...,-1
162977,cover interact forum left,0
162978,big project came india modi dream project happ...,0


In [None]:
import pandas as pd
from google.colab import files

# Assuming df is your DataFrame
# Save the DataFrame to a CSV file
df.to_csv('clean_data.csv')

# Download the file to your local machine
files.download('clean_data.csv')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.feature_selection import SelectKBest

# Assuming 'top_features' contains the names of the top features
selected_features = top_features['Feature']

# Create a feature selector
feature_selector = SelectKBest(chi2, k=len(selected_features)).fit(X_tfidf, df['category'])

# Transform the dataset to select the top features
X_selected = feature_selector.transform(X_tfidf)


In [None]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_selected, df['category'], test_size=0.2, random_state=42)


In [None]:
X_selected

<162969x100 sparse matrix of type '<class 'numpy.float64'>'
	with 321058 stored elements in Compressed Sparse Row format>

### Split the Data

In [None]:
print(y_test)

42229     0
22035     0
79982    -1
118493   -1
12815     0
         ..
47105     0
33632    -1
93676     0
37757     0
132997    0
Name: category, Length: 32594, dtype: int64


In [None]:



model = LogisticRegression()
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10,1000,10000],
    'solver': ['liblinear', 'lbfgs','saga','newton-cg'],
    'penalty': ['l2'],
    'max_iter': [100, 200, 500],  # Represents epochs

}


# Grid Search with Cross-Validation
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best Parameters and Best Score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
best_model = grid_search.best_estimator_

# Final Evaluation on Test Set
test_predictions = best_model.predict(X_test)
print("Test Set Evaluation:\n", classification_report(y_test, test_predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'C': 10000, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Score: 0.8633096966410738
Test Set Evaluation:
               precision    recall  f1-score   support

          -1       0.78      0.53      0.63      7152
           0       0.88      0.96      0.92     25442

    accuracy                           0.86     32594
   macro avg       0.83      0.74      0.77     32594
weighted avg       0.86      0.86      0.85     32594



## Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, chi2


In [None]:
# Assuming 'top_features' contains the names of the top features
selected_features = top_features['Feature']

# Create a feature selector
feature_selector = SelectKBest(chi2, k=len(selected_features)).fit(X_tfidf, df['category'])

# Transform the dataset to select the top features
X_selected = feature_selector.transform(X_tfidf)


In [None]:
# Assuming df is your DataFrame with 'text' and 'category' columns
#X_tfidf = tfidf.fit_transform(df['category'])


X = X_selected
y = df['category']
# If you have a feature selector, apply it here
# X_selected = feature_selector.transform(X_tfidf)
# X = X_selected

#X = X_tfidf  # Use this if you're not using a feature selector
#y = df['category']

# Split the dataset
# First, split into training and (temporary) test sets (e.g., 80-20%)
X_train, X_test, y_train, y_test = train_test_split(X_selected, df['category'], test_size=0.2, random_state=42)



# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 30],
    'min_samples_split': [2,4,6],
    'min_samples_leaf': [1],
    'max_features': ['sqrt', 'log2']
}

# Initialize a Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Initialize Grid Search with cross-validation
rf_grid_search = GridSearchCV(rf, param_grid, cv = 5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit the grid search model
rf_grid_search.fit(X_train, y_train)

# Get the best estimator
best_rf = rf_grid_search.best_estimator_

# Evaluate on the validation set

# Finally, evaluate on the test set
test_predictions = best_rf.predict(X_test)
print("Test Set Evaluation:\n", classification_report(y_test, test_predictions))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Test Set Evaluation:
               precision    recall  f1-score   support

          -1       0.80      0.43      0.56      7152
           0       0.86      0.97      0.91     25442

    accuracy                           0.85     32594
   macro avg       0.83      0.70      0.74     32594
weighted avg       0.85      0.85      0.83     32594



In [None]:
best_cv_score = rf_grid_search.best_score_
print("Best CV score (Grid Search):", best_cv_score)

Best CV score (Grid Search): 0.8506615532118887
