# Information Retrieval Project
- 1. Collect all the training datasets from online sources
- 2. Map their labels to positive, negative, neutral
- 3. Combine all the datasets together
-----------------------------
- 4. Remove punctuation and stopwords, then apply lemmatization & stemming
- 5. Get tf-idf
- 6. Choose a machine learning model/deep learning model
    - ML: SVM, Random Forest, Logistic Regression (this), Gradient Boosting, Naive Bayes

    - DL: use with word embeddings like Word2Vec or GloVe to capture semantic relationships between words:
      RNN, LSTM, CNN, BERT (this)

- 7. Concatenate BERT value with TFIDF value
----------------------------------
- 8. Train Test Split using TFIDF values as Features and sentiment labels as Targets
- 9. Evaluate Accuracy, Precision, Recall, F1-score of Model using Logistic Regression
----------------------------------
- 10. Test Model on Manually Labelled data after Scraping from Web
- 11. Preprocess manually labeled data in the same way as online datasets
- 12. Compare and Calculate Accuracy of Model
---------------------------------
- 13. Identify strengths and weaknesses of model
- 14. Make appropriate adjustments
- 15. Report and recommendations

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK resources
# 'punkt' backs word_tokenize, 'stopwords' backs stopwords.words('english'),
# and 'wordnet' backs WordNetLemmatizer, all of which are used in preprocess_text.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ZW\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ZW\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ZW\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# TF-IDF


In [None]:
# Lazily-built NLTK helpers shared by every preprocess_text call, so the
# stopword list is read and the lemmatizer/stemmer are created only once
# instead of once per row.
_PREPROCESS_TOOLS = {}


def _get_preprocess_tools():
  """Return (stop_words, lemmatizer, stemmer), building them on first use."""
  if not _PREPROCESS_TOOLS:
    _PREPROCESS_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
    _PREPROCESS_TOOLS['lemmatizer'] = WordNetLemmatizer()
    _PREPROCESS_TOOLS['stemmer'] = PorterStemmer()
  return (_PREPROCESS_TOOLS['stop_words'],
          _PREPROCESS_TOOLS['lemmatizer'],
          _PREPROCESS_TOOLS['stemmer'])


def preprocess_text(text):
  """Clean one piece of text and return it as a space-joined token string.

  Steps, in order: strip every character that is not a letter, digit or
  whitespace; tokenise with NLTK; drop English stopwords (compared
  case-insensitively); lemmatise with WordNet; stem with Porter.

  Args:
    text: The raw text. Anything that is not a ``str`` (for example the
      ``None``/NaN that pandas produces for an empty CSV cell) is treated as
      empty text instead of raising inside ``re.sub``.

  Returns:
    The processed tokens joined by single spaces; ``''`` when nothing is left.
  """
  if not isinstance(text, str):
    return ''

  # Remove special characters (anything that is not alphanumeric/whitespace)
  text = re.sub(r'[^a-zA-Z0-9\s]','', text)

  stop_words, lemmatizer, stemmer = _get_preprocess_tools()

  # Filter, lemmatise and stem in a single pass. The word-pattern check is
  # kept from the original pipeline as a guard against non-word tokens.
  tokens = [
      stemmer.stem(lemmatizer.lemmatize(token))
      for token in word_tokenize(text)
      if re.match(r'\b\w+\b',token) and token.lower() not in stop_words
  ]

  return ' '.join(tokens)

# Read the CSV file
df = pd.read_csv('train_pos_neg.csv')

# Preprocess the 'Text' column
df['preprocessed_text'] = df['Text'].apply(preprocess_text)

# Calculate TF-IDF over unigrams, bigrams and trigrams of each preprocessed text
tfidf = TfidfVectorizer(ngram_range=(1,3))
tfidf_matrix = tfidf.fit_transform(df['preprocessed_text'])

# Get feature names (n-grams), in column order of tfidf_matrix
feature_names = tfidf.get_feature_names_out()

# Downcast while the matrix is still sparse so the dense array is float32 from
# the start. Densifying first and casting afterwards would hold a float64 dense
# copy and a float32 dense copy of the whole matrix in memory at the same time.
tfidf_matrix_dense = tfidf_matrix.astype('float32').toarray()

tfidf_df = pd.DataFrame(tfidf_matrix_dense,columns = feature_names)

# Print TF-IDF scores for each n-gram in each text. The scores are read from
# the sparse matrix (original precision), so only stored entries are visited
# rather than scanning every column of every dense row.
tfidf_csr = tfidf_matrix.tocsr()
for i in range(tfidf_csr.shape[0]):
  row_start, row_stop = tfidf_csr.indptr[i], tfidf_csr.indptr[i + 1]
  row_entries = sorted(zip(tfidf_csr.indices[row_start:row_stop],
                           tfidf_csr.data[row_start:row_stop]))
  print(f"Non-zero TF-IDF values for row{i+ 1}:")
  for index, score in row_entries:
    if score == 0:
      continue  # skip explicitly stored zeros, as nonzero() on a dense row would
    word = feature_names[index]
    print(f"{word}: {score:.6f}")
  print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
compani: 0.062500
compani said: 0.106542
earlier: 0.090837
earlier finnish: 0.146945
earlier finnish compani: 0.146945
fell: 0.091985
fell x20ac: 0.146945
fell x20ac 103: 0.146945
finnish: 0.071597
finnish compani: 0.121188
finnish compani said: 0.140819
march: 0.087246
march 31: 0.146945
march 31 fell: 0.146945
million: 0.217172
million us: 0.126000
million us 165: 0.146945
million x20ac: 0.133101
million x20ac 131: 0.146945
million year: 0.122628
million year earlier: 0.128017
month: 0.080550
month march: 0.140819
month march 31: 0.146945
net: 0.071341
net profit: 0.089600
net profit three: 0.146945
profit: 0.062055
profit three: 0.146945
profit three month: 0.146945
said: 0.068612
three: 0.096069
three month: 0.121188
three month march: 0.146945
us: 0.066484
us 165: 0.146945
us 165 million: 0.146945
x20ac: 0.237328
x20ac 103: 0.146945
x20ac 103 million: 0.146945
x20ac 131: 0.146945
x20ac 131 million: 0.146945
year: 0.0

# BERT word embeddings
- Install Transformers via pip: `pip install transformers`

In [None]:
from transformers import BertTokenizer, BertModel,pipeline
import torch
import pandas as pd

# Load pre trained BERT model and tokenizer
# Both are loaded by name through from_pretrained, so they come from the same checkpoint.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define a function for BERT preprocessing using pipeline
# bert_pipeline is a module-level 'feature-extraction' pipeline; bert_preprocess_text
# below looks it up as a global each time it is called.
bert_pipeline = pipeline('feature-extraction',model=model,tokenizer=tokenizer)

# Debug marker printed once the pipeline has been constructed.
print("here first")
def bert_preprocess_text(text):
  """Embed `text` with the global `bert_pipeline` and mean-pool the result.

  The first element of the pipeline output is turned into a tensor, averaged
  along dimension 0 and returned as a NumPy array.
  NOTE(review): presumably dimension 0 is the token axis, giving one fixed-size
  vector per text - confirm against the pipeline's output layout.
  """
  pipeline_output = bert_pipeline(text)
  token_vectors = torch.tensor(pipeline_output[0])
  pooled = token_vectors.mean(dim=0)
  return pooled.numpy()

# Convert text to word embeddings to create a new DataFrame.
# Embeddings are computed from the raw 'Text' column (not the preprocessed
# text); each pooled vector becomes one row of df_embeddings.
embeddings_list = []
for counter, text in enumerate(df['Text']):
  embeddings_list.append(bert_preprocess_text(text))
  print(counter)  # progress indicator: index of the text just embedded
df_embeddings = pd.DataFrame(embeddings_list)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504


In [None]:
# Print a summary of tfidf_df (index range, column count, dtypes, memory usage).
tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12318 entries, 0 to 12317
Columns: 185564 entries, 00 to zzzactli user watch
dtypes: float32(185564)
memory usage: 8.5 GB


# Combined TF-IDF and BERT word embeddings

In [None]:
# Convert all columns in df_embeddings to float32 in one call.
# A single astype either converts the whole frame or leaves it untouched,
# whereas a per-column loop that swallows MemoryError can silently leave the
# frame with a mix of converted and unconverted columns.
embeddings_cols = df_embeddings.columns

try:
  df_embeddings = df_embeddings.astype('float32')
except MemoryError:
  # Best effort, as before: report it and carry on with the existing dtypes.
  print("Memory Error2")

# Number of embedding columns handled (kept for parity with the old counter).
counter2 = len(embeddings_cols)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [None]:
# Build the training feature table: TF-IDF columns on the left, BERT embedding
# columns on the right, with rows matched on the index of each frame.
df_combined = pd.concat(objs=[tfidf_df, df_embeddings], axis='columns')

# Train model with Train Test Split and self test model
- 0: negative
- 1: positive

In [None]:
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [None]:
# Separate features(TF-IDF and word embeddings) and target variable
X = df_combined
y = df['Sentiment']
print("step 1 done")

# Define batch size
batch_size = 50

# Number of batches, rounded up so a final partial batch is counted
num_batches_X = -(-len(X) // batch_size)

# Initialise SGD classifier with logistic regression loss
# NOTE(review): scikit-learn renamed loss='log' to 'log_loss' in 1.1 and removed
# the old spelling in 1.3 - confirm which version this notebook targets.
# NOTE(review): per the scikit-learn docs max_iter only affects fit(), not
# partial_fit(), so the training below makes a single pass over each batch.
sgd_model = SGDClassifier(loss='log',max_iter=1000)

# Initialise lists to store training and testing data
X_train_batches = []
X_test_batches = []
y_train_batches = []
y_test_batches = []

# Split the data into batches and then split each batch into training and testing data
for start_idx in range(0, len(X), batch_size):
  end_idx = min(start_idx + batch_size, len(X))
  X_batch = X[start_idx:end_idx]
  y_batch = y[start_idx:end_idx]

  # 80/20 split inside the batch; the fixed seed keeps the split reproducible
  X_train, X_test, y_train, y_test = train_test_split(X_batch,y_batch,test_size=0.2,
                                                      random_state=42)

  # Append the training and testing data for the current batch
  X_train_batches.append(X_train)
  X_test_batches.append(X_test)
  y_train_batches.append(y_train)
  y_test_batches.append(y_test)

print("step 2 done")

# partial_fit needs the full label set up front; compute it once, not per batch
all_classes = np.unique(y)

# Fit the logistic regression model in batches
for X_train, y_train in zip(X_train_batches, y_train_batches):
  # Convert feature names to strings (the combined frame mixes str and int column labels)
  X_train.columns = X_train.columns.astype(str)

  # Fit the logistic regression model on the current batch
  sgd_model.partial_fit(X_train,y_train,classes=all_classes)
print("step 3 done")

print("step 4 done")

print("step 5 done")

## Make predictions on the testing data
y_pred = []
for X_test in X_test_batches:
  # Convert feature names to strings so they match the names seen during fitting
  X_test.columns = X_test.columns.astype(str)

  y_pred.extend(sgd_model.predict(X_test))

print("step 6 done")

## Evaluate the model
y_test = np.concatenate(y_test_batches)
accuracy = accuracy_score(y_test, y_pred)
# Store the text report under its own name: assigning it to
# `classification_report` would overwrite the imported function, so any later
# call (or a re-run of this cell) would try to call a string.
report = classification_report(y_test,y_pred)
print("step 7 done")

print(f'Accuracy: {accuracy}')
print(f'Classification report:\n{report}')

step 1 done
step 2 done




step 3 done
step 4 done
step 5 done
step 6 done
step 7 done
Accuracy: 0.6956168831168831
Classification report:
              precision    recall  f1-score   support

           0       0.57      0.84      0.68       948
           1       0.86      0.61      0.71      1516

    accuracy                           0.70      2464
   macro avg       0.71      0.72      0.69      2464
weighted avg       0.75      0.70      0.70      2464



In [None]:
!pip install joblib

import joblib

# Save the model to a file
# NOTE(review): only the classifier is written here; the fitted `tfidf`
# vectorizer is not saved in this cell, so reusing the pickle in a fresh
# session would presumably also need the vectorizer - confirm whether it is
# persisted elsewhere.
joblib.dump(sgd_model, 'sgd_model_posneg.pkl')

['sgd_model_posneg.pkl']

# Test on manually labelled data

# TF-IDF

In [None]:
# Read the manually labelled data
df_manual = pd.read_csv('test_pos_neg.csv')

# Preprocess the 'Headline' column in df_manual
df_manual['preprocessed_text'] = df_manual['Headline'].apply(preprocess_text)

# Calculate TF-IDF for each preprocessed headline in df_manual.
# transform (not fit_transform) reuses the vocabulary and IDF weights learned
# from the training data, so the columns line up with the training features.
tfidf_manual = tfidf.transform(df_manual['preprocessed_text'])

# Get feature names(words)
feature_names_manual = tfidf.get_feature_names_out()

# Convert TF-IDF values to a DataFrame.
# Downcast to float32 while still sparse: this matches the float32 TF-IDF
# features the model was trained on and halves the size of the dense array.
tfidf_matrix_dense_manual = tfidf_manual.astype('float32').toarray()
tfidf_df_manual = pd.DataFrame(tfidf_matrix_dense_manual, columns=feature_names_manual)

# BERT word embeddings

In [None]:
# Load pre trained BERT model and tokenizer
# NOTE(review): this repeats the 'bert-base-uncased' setup from the training
# section and rebinds the global `bert_pipeline` that bert_preprocess_text
# uses - presumably redundant if that earlier cell has already run; confirm.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define a function for BERT preprocessing using pipeline
bert_pipeline = pipeline('feature-extraction',model=model,tokenizer=tokenizer)

# Debug marker printed once the pipeline has been constructed.
print("here first")

# Convert text to word embeddings to create a new DataFrame.
# Embeddings are computed from the raw 'Headline' column; each pooled vector
# becomes one row of df_embeddings_manual.
embeddings_list = []
for counter, text in enumerate(df_manual['Headline']):
  embeddings_list.append(bert_preprocess_text(text))
  print(counter)  # progress indicator: index of the headline just embedded

df_embeddings_manual = pd.DataFrame(embeddings_list)

here first
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274

# Combined TF-IDF with BERT word embeddings

In [None]:
# Build the evaluation feature table: TF-IDF columns on the left, BERT
# embedding columns on the right, with rows matched on the index of each frame.
df_combined_manual = pd.concat(objs=[tfidf_df_manual, df_embeddings_manual], axis='columns')

# Perform Classification using previously trained and saved model
- 0: negative
- 1: positive

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [None]:
# Read the CSV file
# This re-reads the same 'test_pos_neg.csv' loaded in the TF-IDF section above
# and rebinds df_manual, so the 'preprocessed_text' column added there is not
# present on this fresh frame. Only the label column is read from it below.
df_manual = pd.read_csv('test_pos_neg.csv')
print("1 file imported")

1 file imported


In [None]:
# Ground-truth labels for the manually labelled headlines
y_test_manual = df_manual['FINAL Sentiment']

# The combined frame mixes string and integer column labels; cast them all to
# str before handing the frame to the classifier.
df_combined_manual.columns = df_combined_manual.columns.astype(str)

# Predict with the SGD classifier trained earlier
y_pred_manual = sgd_model.predict(df_combined_manual)
print("step 1 done")

# Score the predictions against the manual labels
accuracy = accuracy_score(y_test_manual, y_pred_manual)
report = classification_report(y_test_manual,y_pred_manual)
print("step 2 done")

print(f'Accuracy: {accuracy}', f'Classification report:\n{report}', sep='\n')

step 1 done
step 2 done
Accuracy: 0.7799834574028123
Classification report:
              precision    recall  f1-score   support

           0       0.80      0.75      0.77       601
           1       0.77      0.81      0.79       608

    accuracy                           0.78      1209
   macro avg       0.78      0.78      0.78      1209
weighted avg       0.78      0.78      0.78      1209

