### **Bag of Words**

In [None]:
!pip install gensim

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download("all")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [None]:
# Load the pre-cleaned articles and take a first look at the frame.
df = pd.read_csv("preprocessed_content.csv")
print(df.columns)
print(df.shape)
df.info()  # info() writes its own report and returns None, so wrapping it in print() adds a stray "None"
print(df.isnull().sum())
sns.histplot(x = "total_score",data = df)

# Compute the median once; evaluating it inside a per-row lambda repeats the
# full-column computation for every row.
median_score = df["total_score"].median()
print(median_score)

# Scores strictly below the median are "Green", everything else (including
# missing scores, for which the comparison is False) is "Non Green".
df["label"] = np.where(df["total_score"] < median_score,"Green","Non Green")
df.head()

# Model inputs: the text column as features, the string labels as targets.
x = df["preprocessed_content"].values
y = df["label"].values

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Replace the string labels with integer class ids (mapping is kept in le.classes_).
y = le.fit_transform(y)

In [None]:
# Clean every document: split it into sentences, replace every non-letter
# character with a space, lower-case the result and glue the sentences back
# together with single spaces.
non_letters = re.compile("[^a-zA-Z]")  # compiled once instead of once per sentence

corpus = []
for doc in x :
  # sent_tokenize only accepts strings; a missing cell (NaN / None) would make
  # it raise, so missing text is treated as an empty document and anything
  # else is coerced to str before tokenising.
  raw_text = "" if pd.isna(doc) else str(doc)
  cleaned = [non_letters.sub(" ",sentence).lower() for sentence in nltk.sent_tokenize(raw_text)]
  corpus.append(" ".join(cleaned))
corpus

len(corpus[0])

In [None]:
# English stop-word set and the WordNet lemmatizer used to normalise tokens.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# One string per document: tokenise, drop stop words, lemmatise what is left
# and join the surviving tokens with single spaces.
sent = [
  " ".join(
    lemmatizer.lemmatize(token)
    for token in nltk.word_tokenize(document)
    if token not in stop_words
  )
  for document in corpus
]
sent

len(sent)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Split the documents *before* vectorising. Fitting the vectoriser on the whole
# corpus lets the held-out documents influence the vocabulary (data leakage).
# The row partition depends only on the number of samples and random_state, so
# y_train / y_test are the same rows as when the full matrix was split.
sent_train,sent_test,y_train,y_test = train_test_split(sent,y,test_size = 0.2,
                                                       random_state = 42)

# Learn the bag-of-words vocabulary from the training documents only, then
# project the test documents onto that fixed vocabulary.
cv = CountVectorizer()
x_train = cv.fit_transform(sent_train)
x_test = cv.transform(sent_test)

# Full-corpus count matrix in the training vocabulary, kept under the name `x`
# that this cell has always (re)bound.
x = cv.transform(sent)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import MultinomialNB

# Bagged multinomial naive Bayes: 50 base learners, each trained on a
# with-replacement draw sized at 80% of the training rows.
base_model = MultinomialNB()
bagging_settings = dict(
    n_estimators = 50,
    max_samples = 0.8,
    bootstrap = True,
    random_state = 42,
)
bag_model = BaggingClassifier(estimator = base_model,**bagging_settings)

bag_model.fit(x_train,y_train)

In [None]:
# Hard class predictions for both splits. The training predictions must be
# bound to `prdt_train`: that is the name printed below and scored in the
# accuracy cell (the original bound them to `rdt_train`, causing a NameError).
prdt_train = bag_model.predict(x_train)
prdt_test = bag_model.predict(x_test)

print("Predictions on Training Data : \n",prdt_train)
print("\nPredictions on Testing Data : \n",prdt_test)

In [None]:
# Per-class probabilities for both splits. The training probabilities must be
# bound to `prob_train`, the name printed below (the original bound them to
# `rob_train`, causing a NameError).
prob_train = bag_model.predict_proba(x_train)
prob_test = bag_model.predict_proba(x_test)

print("Probabilities on Training Data : \n",prob_train)
print("\nProbabilities on Testing Data : \n",prob_test)

In [None]:
from sklearn.metrics import accuracy_score

# Share of correctly classified rows on each split.
acc_train,acc_test = (
    accuracy_score(actual,predicted)
    for actual,predicted in ((y_train,prdt_train),(y_test,prdt_test))
)

print("Accuracy on Training Data : ",acc_train)
print("Accuracy on Testing Data : ",acc_test)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Confusion matrix on the held-out split (rows: true class, columns: predicted class).
cf = confusion_matrix(y_test,prdt_test)
print("Confusion Matrix : \n",cf)

# fmt = "d" prints the cell counts as plain integers; the default annotation
# format switches to scientific notation once a count reaches three digits.
ax = sns.heatmap(cf,cmap = "coolwarm",annot = True,fmt = "d")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.title("Confusion Matrix (Test Data)")

plt.show()

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the bagged model on the held-out split, drawn with a grid.
roc_display = RocCurveDisplay.from_estimator(bag_model,x_test,y_test)
roc_display.ax_.grid(True)

plt.show()

### **TF - IDF**

In [None]:
!pip install gensim

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import nltk
nltk.download("all")

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
import re

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("preprocessed_content.csv")

In [None]:
# Column labels of the loaded frame.
print(df.columns)

In [None]:
# (rows, columns) of the loaded frame.
print(df.shape)

In [None]:
# Dtypes and non-null counts per column. info() writes the report itself and
# returns None, so it is called directly; print(df.info()) appends a stray "None".
df.info()

In [None]:
# Number of missing values in each column.
print(df.isnull().sum())

In [None]:
# Histogram of total_score, the column whose median defines the labels below.
sns.histplot(x = "total_score",data = df)

In [None]:
# Median of total_score (used as the Green / Non Green threshold in the next cell).
print(df["total_score"].median())

In [None]:
# Compute the median once; evaluating it inside a per-row lambda repeats the
# full-column computation for every row.
median_score = df["total_score"].median()

# Scores strictly below the median are "Green", everything else (including
# missing scores, for which the comparison is False) is "Non Green".
df["label"] = np.where(df["total_score"] < median_score,"Green","Non Green")

In [None]:
# Model inputs: the text column as features, the string labels as targets.
x = df["preprocessed_content"].values
y = df["label"].values

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Replace the string labels with integer class ids; the id -> label mapping is
# available afterwards as le.classes_.
# NOTE(review): classes are presumably ordered alphabetically ("Green" -> 0) — confirm via le.classes_.
y = le.fit_transform(y)

In [None]:
# Clean every document: split it into sentences, replace every non-letter
# character with a space, lower-case the result and glue the sentences back
# together with single spaces.
non_letters = re.compile("[^a-zA-Z]")  # compiled once instead of once per sentence

corpus = []
for doc in x :
  # sent_tokenize only accepts strings; a missing cell (NaN / None) would make
  # it raise, so missing text is treated as an empty document and anything
  # else is coerced to str before tokenising.
  raw_text = "" if pd.isna(doc) else str(doc)
  cleaned = [non_letters.sub(" ",sentence).lower() for sentence in nltk.sent_tokenize(raw_text)]
  corpus.append(" ".join(cleaned))
corpus

In [None]:
# Character length of the first cleaned document (corpus entries are strings).
len(corpus[0])

In [None]:
# English stop-word set and the WordNet lemmatizer used to normalise tokens.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# One string per document: tokenise, drop stop words, lemmatise what is left
# and join the surviving tokens with single spaces.
sent = [
  " ".join(
    lemmatizer.lemmatize(token)
    for token in nltk.word_tokenize(document)
    if token not in stop_words
  )
  for document in corpus
]
sent

In [None]:
# Number of processed documents.
len(sent)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser with library-default settings.
tf = TfidfVectorizer()

In [None]:
# Learn vocabulary and IDF weights from `sent` and return the TF-IDF matrix.
# NOTE(review): this fit sees every document, including those later held out as the test split.
x = tf.fit_transform(sent)

In [None]:
from sklearn.model_selection import train_test_split

# Split the documents rather than the already-vectorised matrix. The row
# partition depends only on the number of samples and random_state, so
# y_train / y_test are the same rows as when the matrix was split.
sent_train,sent_test,y_train,y_test = train_test_split(sent,y,test_size = 0.2,
                                                       random_state = 42)

# Re-fit the vectoriser on the training documents only, so held-out documents
# do not shape the vocabulary or the IDF weights (data leakage), then project
# the test documents onto that fixed vocabulary.
x_train = tf.fit_transform(sent_train)
x_test = tf.transform(sent_test)

# Keep `x` aligned with the re-fitted vocabulary.
x = tf.transform(sent)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import ComplementNB

In [None]:
# Bagged complement naive Bayes: 90 base learners, each trained on a
# with-replacement draw sized at 80% of the training rows.
base_model = ComplementNB()
bagging_settings = dict(
    n_estimators = 90,
    max_samples = 0.8,
    bootstrap = True,
    random_state = 42,
)
bag_model = BaggingClassifier(estimator = base_model,**bagging_settings)

In [None]:
# Train the bagged ensemble on the training split.
bag_model.fit(x_train,y_train)

In [None]:
# Hard class predictions for the training and testing splits.
prdt_train,prdt_test = (bag_model.predict(features) for features in (x_train,x_test))

for caption,labels in (
    ("Predictions on Training Data : \n",prdt_train),
    ("\nPredictions on Testing Data : \n",prdt_test),
) :
  print(caption,labels)

In [None]:
# Per-class probabilities for the training and testing splits.
prob_train,prob_test = (bag_model.predict_proba(features) for features in (x_train,x_test))

for caption,probabilities in (
    ("Probabilities on Training Data : \n",prob_train),
    ("\nProbabilities on Testing Data : \n",prob_test),
) :
  print(caption,probabilities)

In [None]:
from sklearn.metrics import accuracy_score

# Share of correctly classified rows on each split.
acc_train,acc_test = (
    accuracy_score(actual,predicted)
    for actual,predicted in ((y_train,prdt_train),(y_test,prdt_test))
)

print("Accuracy on Training Data : ",acc_train)
print("Accuracy on Testing Data : ",acc_test)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Confusion matrix on the held-out split (rows: true class, columns: predicted class).
cf = confusion_matrix(y_true = y_test,y_pred = prdt_test)
print("Confusion Matrix : \n",cf)

In [None]:
# fmt = "d" prints the cell counts as plain integers; the default annotation
# format switches to scientific notation once a count reaches three digits.
ax = sns.heatmap(cf,cmap = "coolwarm",annot = True,fmt = "d")
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.title("Confusion Matrix (Test Data)")

plt.show()

### **Word2Vec**

In [None]:
!pip install gensim

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
import re
import nltk
nltk.download("all")

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("preprocessed_content.csv")

In [None]:
# Column labels of the loaded frame.
print(df.columns)

In [None]:
# (rows, columns) of the loaded frame.
print(df.shape)

In [None]:
# Dtypes and non-null counts per column. info() writes the report itself and
# returns None, so it is called directly; print(df.info()) appends a stray "None".
df.info()

In [None]:
# Number of missing values in each column.
print(df.isnull().sum())

In [None]:
# Number of rows that exactly duplicate an earlier row.
print(df.duplicated().sum())

In [None]:
# Histogram of the total_score column.
sns.histplot(x = "total_score", data = df)

In [None]:
# Median of total_score, displayed as the cell output.
df["total_score"].median()

In [None]:
# Derive the threshold from the data instead of the literal 29.9, so the split
# stays correct if the CSV changes and matches how the Bag-of-Words and TF-IDF
# sections of this notebook label the same file.
median_score = df["total_score"].median()

# Scores strictly below the median are "Green", everything else (including
# missing scores, for which the comparison is False) is "Non green".
df["label"] = np.where(df["total_score"] < median_score,"Green","Non green")

In [None]:
# Model inputs: the text column as features, the string labels as targets.
x = df["preprocessed_content"].values
y = df["label"].values

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Replace the string labels with integer class ids; the id -> label mapping is
# available afterwards as le.classes_.
# NOTE(review): classes are presumably ordered alphabetically ("Green" -> 0) — confirm via le.classes_.
y = le.fit_transform(y)

In [None]:
# Clean every document: split it into sentences, replace every non-letter
# character with a space, lower-case the result and glue the sentences back
# together with single spaces.
non_letters = re.compile("[^a-zA-Z]")  # compiled once instead of once per sentence

corpus = []
for doc in x :
  # sent_tokenize only accepts strings; a missing cell (NaN / None) would make
  # it raise, so missing text is treated as an empty document and anything
  # else is coerced to str before tokenising.
  raw_text = "" if pd.isna(doc) else str(doc)
  cleaned = [non_letters.sub(" ",sentence).lower() for sentence in nltk.sent_tokenize(raw_text)]
  corpus.append(" ".join(cleaned))
corpus

In [None]:
# Number of cleaned documents.
len(corpus)

In [None]:
# Tokenise each cleaned document into its own list of word tokens.
words = list(map(nltk.word_tokenize,corpus))
words

In [None]:
# Token count of the first document at this point in the notebook.
len(words[0])

In [None]:
# English stop-word set and the WordNet lemmatizer used to normalise tokens.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Rewrite each document's token list in place: drop stop words, lemmatise the rest.
for position,tokens in enumerate(words) :
  kept = (token for token in tokens if token not in stop_words)
  words[position] = list(map(lemmatizer.lemmatize,kept))
words

In [None]:
# Token count of the first document at this point in the notebook.
len(words[0])

In [None]:
from gensim.models import Word2Vec

In [None]:
# Train skip-gram (sg = 1) embeddings on the token lists in `words`:
# 300-dimensional vectors, window of 7, and min_count = 2 so that words seen
# fewer than twice are left out of the vocabulary.
# NOTE(review): no seed / workers are passed, so the vectors may differ between runs — confirm.
model = Word2Vec(words,min_count = 2,vector_size = 300,sg = 1,
                 window = 7)

In [None]:
# Vocabulary of the trained model as a word -> index mapping.
model.wv.key_to_index

In [None]:
# Spot-check the embedding of one word and its nearest neighbours.
# NOTE(review): assumes "investor" is in the trained vocabulary; the lookup presumably fails otherwise.
print("Vector Representation : ",model.wv["investor"])
print("\nWords most similar to 'investor' : \n",model.wv.most_similar("investor"))

In [None]:
def sent_to_vector (sent,model) :
  """Return the mean Word2Vec vector of the tokens in `sent`.

  Parameters
  ----------
  sent : str
      Text to embed; it is split with nltk.word_tokenize.
  model : object exposing `.wv`
      Trained embedding model. `model.wv` must support `token in model.wv`,
      `model.wv[token]` and `model.wv.vector_size`.

  Returns
  -------
  numpy.ndarray
      Element-wise mean of the vectors of all in-vocabulary tokens, or an
      all-zero vector of length `model.wv.vector_size` when no token is known.
  """
  tokens = nltk.word_tokenize(sent)
  # Out-of-vocabulary tokens are skipped rather than raising a KeyError.
  vectors = [model.wv[token] for token in tokens if token in model.wv]

  if not vectors :
    # Size the fallback from the model instead of a literal 300, so it always
    # matches the embedding width the model was trained with.
    return np.zeros(model.wv.vector_size)

  return np.mean(vectors,axis = 0)

In [None]:
# Embed each document from its lemmatised, stop-word-free tokens in `words`,
# i.e. the same token lists the Word2Vec model was trained on. Embedding the
# raw `corpus` strings instead silently drops every inflected form (e.g. plurals),
# because only their lemmas exist in the model vocabulary.
# sent_to_vector expects a string, so each token list is re-joined with spaces.
x_vec = np.vstack([sent_to_vector(" ".join(tokens),model) for tokens in words])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the document vectors for testing; the seed is fixed for repeatability.
split_parts = train_test_split(x_vec,y,test_size = 0.2,random_state = 42)
x_train,x_test,y_train,y_test = split_parts

for part in (x_train,y_train,x_test,y_test) :
  print(part.shape)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Bagged decision trees: 110 base learners, each trained on a
# with-replacement draw sized at 80% of the training rows.
base_model = DecisionTreeClassifier()
bagging_settings = dict(
    n_estimators = 110,
    max_samples = 0.8,
    bootstrap = True,
    random_state = 42,
)
bag_model = BaggingClassifier(estimator = base_model,**bagging_settings)

In [None]:
# Train the bagged ensemble on the training split.
bag_model.fit(x_train,y_train)

In [None]:
# Hard class predictions for the training and testing splits.
prdt_train,prdt_test = (bag_model.predict(features) for features in (x_train,x_test))

for caption,labels in (
    ("Predictions on Training Data : \n",prdt_train),
    ("\nPredictions on Testing Data : \n",prdt_test),
) :
  print(caption,labels)

In [None]:
# Per-class probabilities for the training and testing splits.
prob_train,prob_test = (bag_model.predict_proba(features) for features in (x_train,x_test))

for caption,probabilities in (
    ("Probabilities on Training Data : \n",prob_train),
    ("\nProbabilities on Testing Data : \n",prob_test),
) :
  print(caption,probabilities)

In [None]:
from sklearn.metrics import accuracy_score

# Share of correctly classified rows on each split.
acc_train,acc_test = (
    accuracy_score(actual,predicted)
    for actual,predicted in ((y_train,prdt_train),(y_test,prdt_test))
)

print("Accuracy on Training Data : ",acc_train)
print("Accuracy on Testing Data : ",acc_test)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Confusion matrix on the held-out split (rows: true class, columns: predicted class).
cf = confusion_matrix(y_true = y_test,y_pred = prdt_test)
print("Confusion Matrix : \n",cf)

In [None]:
# fmt = "d" prints the cell counts as plain integers; the default annotation
# format switches to scientific notation once a count reaches three digits.
ax = sns.heatmap(cf,cmap = "coolwarm",annot = True,fmt = "d")
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
plt.title("Confusion Matrix (Test Data)")

plt.show()