# Information Retrieval Project
- 1. Get all the training model datasets from online
- 2. Change them to postive, negative, neutral
- 3. Combine all the datasets together
-----------------------------
- 4. Remove punctuation,stopwords, do lemmatization & stemming
- 5. Get tf-idf
- 6. Choose a machine learning model/deep learning model
    - ML: SVM,		Random Forest,		Logistic Regression(this),		Gradient Boosting 	Naive Bayes

    - DL: use with word embeddings like Word2Cev or GloVe to get semantic relationships between words
RNN		LSTM		CNN	   BERT(this)

- 7. Concatenate BERT value with TFIDF value
----------------------------------
- 8. Train Test Split using TFIDF values as Features and sentiment labels as Targets
- 9. Evaluate Accuracy, Precision, Recall, F1-score of Model using Logistic Regression
----------------------------------
- 10. Test Model on Manually Labelled data after Scraping from Web
- 11. Preprocess manually labeled data in the same way as online datasets
- 12. Compare and Calculate Accuracy of Model
---------------------------------
- 13. Identify strengths and weaknesses of model
- 14. Make appropriate adjustments
- 15. Report and recommendations

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ZW\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ZW\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ZW\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Neutral / Opinionated

# TF-IDF

In [5]:
def preprocess_text(text):
  # Remove special characters
  text = re.sub(r'[^a-zA-Z0-9\s]','', text) # pattern matches any character not alphanumeric

  # Tokenisation
  tokens = word_tokenize(text)

  # Remove punctuation
  tokens = [token for token in tokens if re.match(r'\b\w+\b',token)]

  # Remove stopwords
  stop_words = set(stopwords.words('english'))
  tokens = [token for token in tokens if token.lower() not in stop_words]

  # Lemmatization
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in tokens]

  # Stemming
  stemmer = PorterStemmer()
  tokens = [stemmer.stem(token) for token in tokens]

  return ' '.join(tokens)

In [None]:
# Read the CSV file
df = pd.read_csv('train_neu_opi.csv')

# Preprocess the 'Text' column
df['preprocessed_text'] = df['Headline'].apply(preprocess_text)

# Calculate TF-IDF for each preprocessed headline
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['preprocessed_text'])

# Get feature names (words)
feature_names = tfidf.get_feature_names_out()

tfidf_matrix_dense = tfidf_matrix.toarray()

tfidf_df = pd.DataFrame(tfidf_matrix_dense,columns = feature_names)

# Print TF-IDF scores for each word in each text
for i,row in enumerate(tfidf_matrix_dense):
  non_zero_indices = row.nonzero()[0]
  print(f"Non-zero TF-IDF values for row{i+ 1}:")
  for index in non_zero_indices:
    word = feature_names[index]
    score = row[index]
    print(f"{word}: {score:.6f}")
  print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
sharp: 0.339838
stock: 0.135181

Non-zero TF-IDF values for row13672:
12: 0.214226
30k: 0.350609
3x: 0.335884
ad: 0.209270
avg: 0.282631
daili: 0.225029
easili: 0.317332
hold: 0.180068
overnight: 0.269818
pdsb: 0.342505
run: 0.221232
share: 0.154035
today: 0.336915
volum: 0.186097

Non-zero TF-IDF values for row13673:
anoth: 0.195339
bottom: 0.214908
chart: 0.234207
cmf: 0.314461
fast: 0.240285
flow: 0.213046
good: 0.163496
httpstcobcxqtgatei: 0.337039
indi: 0.337039
key: 0.200081
momentum: 0.236355
money: 0.199598
pace: 0.257485
pdsb: 0.307193
plu: 0.244754
show: 0.190468
stock: 0.110852

Non-zero TF-IDF values for row13674:
buyer: 0.501692
nice: 0.396141
pdsb: 0.674468
see: 0.369418

Non-zero TF-IDF values for row13675:
acceler: 0.257669
compani: 0.133155
energi: 0.186145
financ: 0.192088
green: 0.205573
growth: 0.180422
httpstcoq95rvvvyzg: 0.351984
market: 0.132694
nanocap: 0.351984
peck: 0.676383
profit: 0.155923
trad

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



# BERT word embeddings
-  Install Transformers via pip: pip install transformers

In [None]:
from transformers import BertTokenizer, BertModel,pipeline
import torch
import pandas as pd

# Load pre trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define a function for BERT preprocessing using pipeline
bert_pipeline = pipeline('feature-extraction',model=model,tokenizer=tokenizer)

print("here first")
def bert_preprocess_text(text):
  # Preprocess the text using BERT pipeline
  embeddings = bert_pipeline(text)

  # Use mean pooling to get a single embedding for the text
  return torch.mean(torch.tensor(embeddings[0]), dim=0).numpy()

counter = 0

# Convert text to word embeddings to create a new DataFrame
embeddings_list = []
for text in df['Headline']:
  embeddings = bert_preprocess_text(text)
  embeddings_list.append(embeddings)
  print(counter)
  counter+=1
df_embeddings = pd.DataFrame(embeddings_list)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
17935
17936
17937
17938
17939
17940
17941
17942
17943
17944
17945
17946
17947
17948
17949
17950
17951
17952
17953
17954
17955
17956
17957
17958
17959
17960
17961
17962
17963
17964
17965
17966
17967
17968
17969
17970
17971
17972
17973
17974
17975
17976
17977
17978
17979
17980
17981
17982
17983
17984
17985
17986
17987
17988
17989
17990
17991
17992
17993
17994
17995
17996
17997
17998
17999
18000
18001
18002
18003
18004
18005
18006
18007
18008
18009
18010
18011
18012
18013
18014
18015
18016
18017
18018
18019
18020
18021
18022
18023
18024
18025
18026
18027
18028
18029
18030
18031
18032
18033
18034
18035
18036
18037
18038
18039
18040
18041
18042
18043
18044
18045
18046
18047
18048
18049
18050
18051
18052
18053
18054
18055
18056
18057
18058
18059
18060
18061
18062
18063
18064
18065
18066
18067
18068
18069
18070
18071
18072
18073
18074
18075
18076
18077
18078
18079
18080
18081
18082
18083
18084
18085
18086
18087
18088
18089
18090

# Combined TF-IDF and BERT word embeddings

In [None]:
df_combined_neuopi = pd.concat([tfidf_df,df_embeddings],axis=1)

# Train model with Train Test Split and self test model
- 0: negative
- 1: positive

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [None]:
# Separate features(TF-IDF and word embeddings) and target variable
X = df_combined_neuopi
y = df['Sentiment']
print("step 1 done")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,
                                                    random_state=42)
print("step 2 done")

#Convert feature names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

##Initialise and train a logistic regression model for multiclass classification
logreg_model1 = LogisticRegression(max_iter=1000)
  # or 'ovr' for one-vs-rest
logreg_model1.fit(X_train, y_train)
print("step 3 done")

## Use logistic regression predictions as features, RF+LR
# X_train_logreg = logreg_model.predict_proba(X_train)
# X_test_logreg = logreg_model.predict_proba(X_test)
print("step 4 done")

## Train Random Forest on top of logistic regression predictions
# rf_model = RandomForestClassifier(n_estimators=100,random_state=42)
#--rf_model.fit(X_train_logreg, y_train) #RF +LR
# rf_model.fit(X_train, y_train) #RF only
print("step 5 done")

## Make predictions on the testing data
y_pred = logreg_model1.predict(X_test)

#--for random forest+logistic regression(RF+LR)
# y_pred_rf = rf_model.predict(X_test_logreg)

#--for RF only
# y_pred_rf = rf_model.predict(X_test)
print("step 6 done")

## Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test,y_pred)

#--RF, RF+LR
#accuracy = accuracy_score(y_test, y_pred_rf)
#classification_report = classification_report(y_test,y_pred_rf)
print("step 7 done")

print(f'Accuracy: {accuracy}')
print(f'Classification report:\n{classification_report}')

step 1 done
step 2 done
step 3 done
step 4 done
step 5 done
step 6 done
step 7 done
Accuracy: 0.8558971005014171
Classification report:
              precision    recall  f1-score   support

     NEUTRAL       0.83      0.86      0.84      2098
 OPINIONATED       0.88      0.86      0.87      2489

    accuracy                           0.86      4587
   macro avg       0.85      0.86      0.86      4587
weighted avg       0.86      0.86      0.86      4587



In [None]:
!pip install joblib

import joblib

# Save the model to a file
joblib.dump(logreg_model1, 'logreg_model_neuopi.joblib')

['logreg_model_neuopi.joblib']

# Test on manually labelled data


# TF-IDF

In [None]:
# Read the manually labelled data
df_manual = pd.read_csv('test_neu_opi.csv')

# Preprocess the 'Headline column in df_manual
df_manual['preprocessed_text'] = df_manual['Headline'].apply(preprocess_text)

# Calculate TF-IDF for each preprocessed headline in df_manual
tfidf_manual = tfidf.transform(df_manual['preprocessed_text'])

# Get feature names(words)
feature_names_manual = tfidf.get_feature_names_out()

# Convert TF-IDF values to a DataFrame
tfidf_matrix_dense_manual = tfidf_manual.toarray()
tfidf_df_manual = pd.DataFrame(tfidf_matrix_dense_manual, columns=feature_names_manual)

# BERT word embeddings

In [None]:
# Load pre trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define a function for BERT preprocessing using pipeline
bert_pipeline = pipeline('feature-extraction',model=model,tokenizer=tokenizer)

print("here first")

counter = 0

# Convert text to word embeddings to create a new DataFrame
embeddings_list = []
for text in df_manual['Headline']:
  embeddings = bert_preprocess_text(text)
  embeddings_list.append(embeddings)
  print(counter)
  counter+=1

df_embeddings_manual = pd.DataFrame(embeddings_list)

here first
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274

In [None]:
df_combined_manual = pd.concat([tfidf_df_manual,df_embeddings_manual],axis=1)

# Perform Classification using previously trained and saved model
- 0: negative
- 1: positive

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [None]:
#Convert feature names to strings
df_combined_manual.columns = df_combined_manual.columns.astype(str)

## Make predictions using the trained model
##-for logistic regression only
y_pred_manual = logreg_model1.predict(df_combined_manual)

##-for random forest and logistic regression
#X_pred_manual_logreg = logreg_model.predict_proba(df_combined_manual)
#y_pred_manual_rf = rf_model.predict(X_pred_manual_logreg)

##-for random forest only
#y_pred_manual_rf = rf_model.predict(df_combined_manual)
print("step 1 done")

y_test_manual = df_manual['Sentiment']

## Evaluate the model
##-for just logistic regression
accuracy = accuracy_score(y_test_manual, y_pred_manual)
report = classification_report(y_test_manual,y_pred_manual)

##-for random forest only, and random forest + logistic regression
#accuracy = accuracy_score(y_test_manual, y_pred_manual_rf)
#report = classification_report(y_test_manual,y_pred_manual_rf)
print("step 2 done")

print(f'Accuracy: {accuracy}')
print(f'Classification report:\n{report}')

step 1 done
step 2 done
Accuracy: 0.5798712697483909
Classification report:
              precision    recall  f1-score   support

     NEUTRAL       0.38      0.73      0.50       500
 OPINIONATED       0.82      0.52      0.64      1209

    accuracy                           0.58      1709
   macro avg       0.60      0.62      0.57      1709
weighted avg       0.69      0.58      0.60      1709



# Positive / Negative

In [None]:
# Read the CSV file
df = pd.read_csv('train_pos_neg.csv')

# Preprocess the 'Text' column
df['preprocessed_text'] = df['Headline'].apply(preprocess_text)

# Calculate TF-IDF for each preprocessed headline
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['preprocessed_text'])

# Get feature names (words)
feature_names = tfidf.get_feature_names_out()

tfidf_matrix_dense = tfidf_matrix.toarray()

tfidf_df = pd.DataFrame(tfidf_matrix_dense,columns = feature_names)

# Print TF-IDF scores for each word in each text
for i,row in enumerate(tfidf_matrix_dense):
  non_zero_indices = row.nonzero()[0]
  print(f"Non-zero TF-IDF values for row{i+ 1}:")
  for index in non_zero_indices:
    word = feature_names[index]
    score = row[index]
    print(f"{word}: {score:.6f}")
  print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
mln: 0.681878
net: 0.137487
period: 0.147128
profit: 0.119590
quarter: 0.145156
third: 0.178048
vaisala: 0.242824

Non-zero TF-IDF values for row11993:
averag: 0.266033
british: 0.327053
consider: 0.335423
dollar: 0.259741
euro: 0.246940
impact: 0.287043
pound: 0.317942
price: 0.176073
said: 0.195790
sale: 0.182757
upm: 0.365308
us: 0.189717
weaker: 0.354473

Non-zero TF-IDF values for row11994:
2006: 0.279524
2007: 0.260440
earn: 0.226846
impact: 0.316156
implement: 0.363102
meiklejohn: 0.461849
mr: 0.357509
neg: 0.293015
program: 0.311932
said: 0.215647

Non-zero TF-IDF values for row11995:
ad: 0.224789
apart: 0.327888
becom: 0.281758
deal: 0.222752
decid: 0.315210
devalu: 0.397576
moment: 0.324331
normal: 0.303450
number: 0.257864
sale: 0.173280
say: 0.193297
therefor: 0.352666

Non-zero TF-IDF values for row11996:
cut: 0.273463
explain: 0.420394
fall: 0.291184
frequenc: 0.508031
last: 0.279655
new: 0.234477
said: 0.25

# BERT word embeddings
-  Install Transformers via pip: pip install transformers

In [None]:
from transformers import BertTokenizer, BertModel,pipeline
import torch
import pandas as pd

# Load pre trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define a function for BERT preprocessing using pipeline
bert_pipeline = pipeline('feature-extraction',model=model,tokenizer=tokenizer)

print("here first")
def bert_preprocess_text(text):
  # Preprocess the text using BERT pipeline
  embeddings = bert_pipeline(text)

  # Use mean pooling to get a single embedding for the text
  return torch.mean(torch.tensor(embeddings[0]), dim=0).numpy()

counter = 0

# Convert text to word embeddings to create a new DataFrame
embeddings_list = []
for text in df['Headline']:
  embeddings = bert_preprocess_text(text)
  embeddings_list.append(embeddings)
  print(counter)
  counter+=1
df_embeddings = pd.DataFrame(embeddings_list)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504


# Combined TF-IDF and BERT word embeddings

In [None]:
# Combine together tfidf and word embeddings into one DataFrame
df_combined_posneg = pd.concat([tfidf_df,df_embeddings],axis=1)

# Train model with Train Test Split and self test model
- 0: negative
- 1: positive

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [None]:
# Separate features(TF-IDF and word embeddings) and target variable
X = df_combined_posneg
y = df['Sentiment']
print("step 1 done")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,
                                                    random_state=42)
print("step 2 done")

#Convert feature names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

##Initialise and train a logistic regression model for multiclass classification
logreg_model2 = LogisticRegression(max_iter=1000)
  # or 'ovr' for one-vs-rest
logreg_model2.fit(X_train, y_train)
print("step 3 done")

## Use logistic regression predictions as features, RF+LR
# X_train_logreg = logreg_model.predict_proba(X_train)
# X_test_logreg = logreg_model.predict_proba(X_test)
print("step 4 done")

## Train Random Forest on top of logistic regression predictions
# rf_model = RandomForestClassifier(n_estimators=100,random_state=42)
#--rf_model.fit(X_train_logreg, y_train) #RF +LR
# rf_model.fit(X_train, y_train) #RF only
print("step 5 done")

## Make predictions on the testing data
y_pred = logreg_model2.predict(X_test)

#--for random forest+logistic regression(RF+LR)
# y_pred_rf = rf_model.predict(X_test_logreg)

#--for RF only
# y_pred_rf = rf_model.predict(X_test)
print("step 6 done")

## Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test,y_pred)

#--RF, RF+LR
#accuracy = accuracy_score(y_test, y_pred_rf)
#classification_report = classification_report(y_test,y_pred_rf)
print("step 7 done")

print(f'Accuracy: {accuracy}')
print(f'Classification report:\n{classification_report}')

step 1 done
step 2 done
step 3 done
step 4 done
step 5 done
step 6 done
step 7 done
Accuracy: 0.8185876623376623
Classification report:
              precision    recall  f1-score   support

    NEGATIVE       0.75      0.76      0.75       911
    POSITIVE       0.86      0.86      0.86      1553

    accuracy                           0.82      2464
   macro avg       0.81      0.81      0.81      2464
weighted avg       0.82      0.82      0.82      2464



In [None]:
!pip install joblib

import joblib

# Save the model to a file
joblib.dump(logreg_model2, 'logreg_model_posneg.joblib')

['logreg_model_posneg.joblib']

# Test on manually labelled data





# TF-IDF

In [None]:
# Read the manually labelled data
df_manual = pd.read_csv('test_pos_neg.csv')

# Preprocess the 'Headline column in df_manual
df_manual['preprocessed_text'] = df_manual['Headline'].apply(preprocess_text)

# Calculate TF-IDF for each preprocessed headline in df_manual
tfidf_manual = tfidf.transform(df_manual['preprocessed_text'])

# Get feature names(words)
feature_names_manual = tfidf.get_feature_names_out()

# Convert TF-IDF values to a DataFrame
tfidf_matrix_dense_manual = tfidf_manual.toarray()
tfidf_df_manual = pd.DataFrame(tfidf_matrix_dense_manual,
                               columns=feature_names_manual)

# BERT word embeddings

In [None]:
# Load pre trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define a function for BERT preprocessing using pipeline
bert_pipeline = pipeline('feature-extraction',model=model,tokenizer=tokenizer)

print("here first")

counter = 0

# Convert text to word embeddings to create a new DataFrame
embeddings_list = []
for text in df_manual['Headline']:
  embeddings = bert_preprocess_text(text)
  embeddings_list.append(embeddings)
  print(counter)
  counter+=1

df_embeddings_manual = pd.DataFrame(embeddings_list)

here first
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274

# Combined TF-IDF with BERT word embeddings

In [None]:
# Concatenate TF-IDF values with word embeddings
df_combined_manual = pd.concat([tfidf_df_manual, df_embeddings_manual], axis=1)

# Perform Classification using previously trained and saved model
- 0: negative
- 1: positive

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [None]:
#Convert feature names to strings
df_combined_manual.columns = df_combined_manual.columns.astype(str)

## Make predictions using the trained model
##-for logistic regression only
y_pred_manual = logreg_model2.predict(df_combined_manual)

##-for random forest and logistic regression
#X_pred_manual_logreg = logreg_model.predict_proba(df_combined_manual)
#y_pred_manual_rf = rf_model.predict(X_pred_manual_logreg)

##-for random forest only
#y_pred_manual_rf = rf_model.predict(df_combined_manual)
print("step 1 done")

y_test_manual = df_manual['Sentiment']

## Evaluate the model
##-for just logistic regression
accuracy = accuracy_score(y_test_manual, y_pred_manual)
report = classification_report(y_test_manual,y_pred_manual)

##-for random forest only, and random forest + logistic regression
#accuracy = accuracy_score(y_test_manual, y_pred_manual_rf)
#report = classification_report(y_test_manual,y_pred_manual_rf)
print("step 2 done")

print(f'Accuracy: {accuracy}')
print(f'Classification report:\n{report}')

step 1 done
step 2 done
Accuracy: 0.7857733664185277
Classification report:
              precision    recall  f1-score   support

    NEGATIVE       0.84      0.70      0.77       601
    POSITIVE       0.75      0.87      0.80       608

    accuracy                           0.79      1209
   macro avg       0.79      0.79      0.78      1209
weighted avg       0.79      0.79      0.78      1209



# Testing model on remaining 9k data so that can plot graphs

In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel,pipeline
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
import time

• Perform a random accuracy test on the rest of the data and discuss results

• Discuss performance metrics, e.g., records classified per second, and scalability of the system


# TF-IDF

In [None]:
# Read the manually labelled data
df_9k = pd.read_csv('combined_csv_dupDropped_9kUnlabelled.csv')

# Columns to drop
columns_to_drop = ['Source','Posted','Description','Link','Sentiment1','Sentiment2']

# Assuming df_combined_9k is your DataFrame and columns_to_drop is a list of column names to drop
df_9k.drop(columns=columns_to_drop, inplace=True)

df_9k.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9207 entries, 0 to 9206
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Headline         9207 non-null   object 
 1   FINAL Sentiment  0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 144.0+ KB


In [None]:
# Preprocess the 'Headline column in df_manual
df_9k['preprocessed_text'] = df_9k['Headline'].apply(preprocess_text)

# Calculate TF-IDF for each preprocessed headline in df_manual
tfidf = TfidfVectorizer()
tfidf_9k = tfidf.fit_transform(df_9k['preprocessed_text'])

# Get feature names(words)
feature_names_9k = tfidf.get_feature_names_out()

# Convert TF-IDF values to a DataFrame
tfidf_matrix_dense_9k = tfidf_9k.toarray()
tfidf_df_9k = pd.DataFrame(tfidf_matrix_dense_9k, columns=feature_names_9k)

# Print TF-IDF scores for each word in each text
for i,row in enumerate(tfidf_matrix_dense_9k):
  non_zero_indices = row.nonzero()[0]
  print(f"Non-zero TF-IDF values for row{i+ 1}:")
  for index in non_zero_indices:
    word = feature_names_9k[index]
    score = row[index]
    print(f"{word}: {score:.6f}")
  print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ceo: 0.262072
cofound: 0.364686
deepmind: 0.335796
hire: 0.379930
microsoft: 0.198064
mustafa: 0.443790
suleyman: 0.418683
unit: 0.332283

Non-zero TF-IDF values for row8727:
ai: 0.152939
art: 0.417931
battlefield: 0.510101
buy: 0.207514
electron: 0.428781
invest: 0.233713
nasdaqea: 0.510101

Non-zero TF-IDF values for row8728:
investingcom: 0.227379
keybanc: 0.442159
microsoft: 0.221385
multipl: 0.411415
pay: 0.330557
premium: 0.407624
price: 0.268429
today: 0.268139
worth: 0.339326

Non-zero TF-IDF values for row8729:
etf: 0.292537
includ: 0.418927
leverag: 0.463431
megacap: 0.412055
msft: 0.367898
new: 0.213672
stock: 0.147359
target: 0.291342
tech: 0.258389

Non-zero TF-IDF values for row8730:
account: 0.485969
ad: 0.377088
im: 0.517057
march: 0.329840
retir: 0.467403
stock: 0.164411

Non-zero TF-IDF values for row8731:
2024: 0.301085
esg: 0.624515
march: 0.357445
mustbuy: 0.599847
stock: 0.178171

Non-zero TF-IDF val

# BERT Word Embeddings


In [None]:
# Load pre trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define a function for BERT preprocessing using pipeline
bert_pipeline = pipeline('feature-extraction',model=model,tokenizer=tokenizer)

print("here first")

counter = 0

# Convert text to word embeddings to create a new DataFrame
embeddings_list = []
for text in df_9k['Headline']:
  embeddings = bert_preprocess_text(text)
  embeddings_list.append(embeddings)
  print(counter)
  counter+=1

df_embeddings_9k = pd.DataFrame(embeddings_list)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393


In [None]:
# Concatenate TF-IDF values with word embeddings
df_combined_9k = pd.concat([tfidf_df_9k, df_embeddings_9k], axis=1)

In [None]:
#df_combined_9k.to_csv('df_combined_9k.csv',index=False)
df_combined_9k = pd.read_csv('df_combined_9k.csv')

In [None]:
#X = df_combined_9k
y = df_9k['FINAL Sentiment']

#Convert feature names to strings
df_combined_9k.columns = df_combined_9k.columns.astype(str)

# Speed of prediction
1. Opinionated / Neutral

- Matching model features with test set

In [None]:
from joblib import load

# Specify file path of the model
neuopi_model_file_path = 'logreg_model_neuopi.joblib'

# Load the model from the file
loaded_neuopi_model = load(neuopi_model_file_path)

In [None]:
df_combined_9k.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9207 entries, 0 to 9206
Columns: 9457 entries, 002 to 767
dtypes: float64(9457)
memory usage: 664.3 MB


In [None]:
num_rows = len(df_combined_9k)
print(num_rows)

9207


In [None]:
# Get feature names seen during fit time
fit_time_features = loaded_neuopi_model.feature_names_in_
#fit_time_features = logreg_model1.feature_names_in_
print(fit_time_features)
print(len(fit_time_features))

['00' '000' '000063' ... '765' '766' '767']
32583


In [None]:
#Get feature names in test data
print(df_combined_9k.columns)
print(len(df_combined_9k.columns))

Index(['002', '004', '008', '010', '012', '013', '016', '01667', '017', '018',
       ...
       '758', '759', '760', '761.1', '762', '763', '764.1', '765', '766.1',
       '767'],
      dtype='object', length=9457)
9457


In [None]:
# Identify features present in test data but not in training
missing_features2 = [feature for feature in df_combined_9k.columns if
                     feature not in loaded_neuopi_model.feature_names_in_]
#missing_features2 = [feature for feature in df_combined_9k.columns if
#                     feature not in logreg_model1.feature_names_in_]

# Remove features from test data
df_combined_9k_filtered = df_combined_9k.drop(columns=missing_features2,
                                              errors='ignore')

df_combined_9k_filtered.info()

#Get feature names in test data
print(df_combined_9k_filtered.columns)
print(len(df_combined_9k_filtered.columns))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9207 entries, 0 to 9206
Columns: 5949 entries, 002 to 767
dtypes: float64(5949)
memory usage: 417.9 MB
Index(['002', '004', '008', '010', '012', '013', '016', '017', '018', '025',
       ...
       '755', '756', '757', '758', '759', '760', '762', '763', '765', '767'],
      dtype='object', length=5949)
5949


In [None]:
# Identify features present during training but are missing in the test data
missing_features1 = [feature for feature in
                     loaded_neuopi_model.feature_names_in_ if feature not in
                     df_combined_9k_filtered.columns]
#missing_features1 = [feature for feature in
#                     logreg_model1.feature_names_in_ if feature not in
#                     df_combined_9k_filtered.columns]

# Perform Imputation, fill missing features with 0
df_combined_9k_filled = df_combined_9k_filtered.copy()
df_combined_9k_filled[missing_features1] = 0

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1] = 0
  df_combined_9k_filled[missing_features1]

In [None]:
df_combined_9k_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9207 entries, 0 to 9206
Columns: 31928 entries, 002 to zzzactli
dtypes: float64(5949), int64(25979)
memory usage: 2.2 GB


In [None]:
import numpy as np

# Set print options to display the entire array
np.set_printoptions(threshold=np.inf)

# Get feature names seen during fit time
print(len(fit_time_features))
print(fit_time_features)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 'benelux' 'benfenati' 'benign' 'benioff' 'benitec' 'benjamin' 'bennet'
 'benoemd' 'benrab' 'bento' 'benwerschkul' 'benzen' 'benzinga' 'beptf'
 'bercow' 'berenberg' 'bergenbio' 'berger' 'berggruen' 'bergman'
 'bergqvist' 'bergvik' 'berhad' 'berkeley' 'berkshir' 'berl' 'berlin'
 'berman' 'bern' 'bernahm' 'bernank' 'bernard' 'bernersle' 'bernhard'
 'berni' 'berniesand' 'bernir' 'bernstein' 'berri' 'bertrand' 'besid'
 'best' 'besti' 'bestin' 'bestinclass' 'bestknown' 'bestmanag' 'bestofbre'
 'bestperform' 'bestrat' 'bestsel' 'bet' 'beta' 'betatest' 'beteiligung'
 'betoni' 'betonituot' 'betray' 'betsi' 'bett' 'better' 'better320'
 'betterthanexpect' 'beverag' 'beverli' 'bevff' 'bewa' 'bewar' 'beyer'
 'beyonc' 'beyond' 'bezo' 'bezoshav' 'bffmi' 'bfin' 'bg' 'bgc' 'bgcp'
 'bgg' 'bgmd' 'bgne' 'bgr' 'bharat' 'bharti' 'bhatia' 'bhe' 'bhf' 'bhp'
 'bhubaneswar' 'bhushan' 'bhvn' 'bi' 'bia' 'bias' 'bib' 'bic'
 'bicentenari' 'bicker' 'b

In [None]:
test_list = df_combined_9k_filled.columns

diff1 = [item for item in fit_time_features if item not in test_list]
diff2 = [item for item in test_list if item not in fit_time_features]

print(diff1)
print(diff2)

[]
[]


In [None]:
df_combined_9k_filled_reordered = df_combined_9k_filled[fit_time_features]
df_combined_9k_filled_reordered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9207 entries, 0 to 9206
Columns: 32583 entries, 00 to 767
dtypes: float64(6604), int64(25979)
memory usage: 2.2 GB


In [None]:
print(df_combined_9k_filled_reordered.columns)

Index(['00', '000', '000063', '0001', '00025', '0008', '001', '00117', '00133',
       '002',
       ...
       '758', '759', '760', '761', '762', '763', '764', '765', '766', '767'],
      dtype='object', length=32583)


- Calculating prediction speed for Opinionated/Neutral

In [None]:
#32583, exactly the same

## Make predictions using the trained model
##-for logistic regression only
start_time1 = time.time()
y_pred_9k = loaded_neuopi_model.predict(df_combined_9k_filled_reordered)
#y_pred_9k = logreg_model1.predict(df_combined_9k_filled_reordered)
end_time1 = time.time()
time_all1 = end_time1 - start_time1

#print(y_pred_9k)
#print("Time taken for all predictions: ", time_all1, " seconds")
print("Time taken for one prediction: ", time_all1/num_rows, " seconds")

['NEUTRAL' 'NEUTRAL' 'OPINIONATED' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL'
 'NEUTRAL' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL' 'OPINIONATED' 'NEUTRAL'
 'OPINIONATED' 'OPINIONATED' 'OPINIONATED' 'NEUTRAL' 'NEUTRAL'
 'OPINIONATED' 'NEUTRAL' 'NEUTRAL' 'OPINIONATED' 'OPINIONATED' 'NEUTRAL'
 'NEUTRAL' 'NEUTRAL' 'OPINIONATED' 'OPINIONATED' 'OPINIONATED' 'NEUTRAL'
 'OPINIONATED' 'OPINIONATED' 'NEUTRAL' 'OPINIONATED' 'NEUTRAL' 'NEUTRAL'
 'NEUTRAL' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL' 'OPINIONATED' 'OPINIONATED'
 'OPINIONATED' 'NEUTRAL' 'NEUTRAL' 'OPINIONATED' 'NEUTRAL' 'NEUTRAL'
 'OPINIONATED' 'NEUTRAL' 'OPINIONATED' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL'
 'NEUTRAL' 'NEUTRAL' 'OPINIONATED' 'OPINIONATED' 'NEUTRAL' 'NEUTRAL'
 'OPINIONATED' 'NEUTRAL' 'OPINIONATED' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL'
 'NEUTRAL' 'OPINIONATED' 'NEUTRAL' 'NEUTRAL' 'OPINIONATED' 'NEUTRAL'
 'NEUTRAL' 'NEUTRAL' 'NEUTRAL' 'OPINIONATED' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL'
 'NEUTRAL' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL' 'NEUTRAL'
 'NEUTRAL' 'NEUTRA

2. Positive / Negative
- Matching model features with test set

In [None]:
from joblib import load

# Specify file path of the model
posneg_model_file_path = 'logreg_model_posneg.joblib'

# Load the model from the file
loaded_posneg_model = load(posneg_model_file_path)

In [None]:
# Add predictions to the DataFrame
df_combined_9k_filled_reordered['FINAL Sentiment'] = y_pred_9k

# Create a new DataFrame excluding rows with 'neutral' sentiment
df_filtered = df_combined_9k_filled_reordered[df_combined_9k_filled_reordered
 ['FINAL Sentiment'] != 'NEUTRAL'].copy()

  df_combined_9k_filled_reordered['FINAL Sentiment'] = y_pred_9k
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined_9k_filled_reordered['FINAL Sentiment'] = y_pred_9k


In [None]:
df_combined_9k_filled_reordered.info()
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9207 entries, 0 to 9206
Columns: 32584 entries, 00 to FINAL Sentiment
dtypes: float64(6604), int64(25979), object(1)
memory usage: 2.2+ GB
<class 'pandas.core.frame.DataFrame'>
Index: 4074 entries, 2 to 9205
Columns: 32584 entries, 00 to FINAL Sentiment
dtypes: float64(6604), int64(25979), object(1)
memory usage: 1012.8+ MB


In [None]:
# Filter rows where 'FINAL Sentiment' column is 'NEUTRAL'
df_neutral = df_combined_9k_filled_reordered[df_combined_9k_filled_reordered['FINAL Sentiment'] == 'NEUTRAL']

# Get indices of filtered rows
df_neutral_indices = df_neutral.index
print(df_neutral_indices)

# Create new DataFrame with row numbers and 'FINAL Sentiment'
df_neutral1 = pd.DataFrame({'Row Number': df_neutral_indices,'FINAL Sentiment': df_neutral['FINAL Sentiment']})

df_neutral1.info()
#df_neutral1.to_csv('df_neutral1.csv',index = False)

Index([   0,    1,    3,    4,    5,    6,    7,    8,    9,   10,
       ...
       9189, 9190, 9191, 9192, 9193, 9194, 9197, 9199, 9202, 9206],
      dtype='int64', length=5133)
<class 'pandas.core.frame.DataFrame'>
Index: 5133 entries, 0 to 9206
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Row Number       5133 non-null   int64 
 1   FINAL Sentiment  5133 non-null   object
dtypes: int64(1), object(1)
memory usage: 120.3+ KB


In [None]:
# Delete the 'FINAL Sentiment' column
df_filtered1 = df_filtered.drop(columns=['FINAL Sentiment'])
df_filtered1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4074 entries, 2 to 9205
Columns: 32583 entries, 00 to 767
dtypes: float64(6604), int64(25979)
memory usage: 1012.8 MB


In [None]:
# Get feature names seen during fit time
fit_time_features1 = loaded_posneg_model.feature_names_in_
#fit_time_features = logreg_model1.feature_names_in_
print(fit_time_features1)
print(len(fit_time_features1))

['00' '000' '00025' '001' '002' '003' '004' '00452' '005' '0056' '006'
 '007' '008' '009' '00958' '01' '010' '011' '0117' '011713' '012'
 '01252013' '013' '0131' '015' '016' '017' '018' '019' '02' '020' '0204'
 '021' '0215' '022' '024' '025' '026' '027' '028' '028032' '028share'
 '029' '03' '030' '0305' '030share' '031' '032' '033' '034' '035' '036'
 '037' '038' '039' '0398' '04' '040' '04012006' '040share' '0418' '042'
 '043' '043share' '044' '047' '048' '05' '050' '052' '054' '056' '057'
 '059' '05sp' '06' '060' '061' '061share' '062' '063' '064' '065xno' '066'
 '067' '067share' '069' '07' '0700' '072' '073' '075' '078' '079' '08'
 '080' '081' '083' '084' '086' '087' '089' '09' '090' '091' '092' '093'
 '0930' '094' '0941' '0o' '10' '100' '1000' '10000' '100000' '10005'
 '1000point' '1002' '1003' '1005' '1007' '1008' '1009' '100k' '100kbpd'
 '100m' '100ma' '100million' '100mn' '100pm' '100sma' '100th' '101' '1010'
 '10100' '1014' '1015' '1017' '1019' '10199' '102' '1020' '102312'
 '10

In [None]:
#Get feature names in test data
test_data_features1 = df_filtered1.columns
print(test_data_features1)
print(len(test_data_features1))

Index(['00', '000', '000063', '0001', '00025', '0008', '001', '00117', '00133',
       '002',
       ...
       '758', '759', '760', '761', '762', '763', '764', '765', '766', '767'],
      dtype='object', length=32583)
32583


In [None]:
# Identify features present in test data but not in training
missing_features3 = [feature for feature in df_filtered1.columns if
                     feature not in loaded_posneg_model.feature_names_in_]

# Remove features from test data
df_filtered1_rmExcess = df_filtered1.drop(columns=missing_features3,
                                              errors='ignore')

df_filtered1_rmExcess.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4074 entries, 2 to 9205
Columns: 18639 entries, 00 to 767
dtypes: float64(5638), int64(13001)
memory usage: 579.4 MB


In [None]:
# Identify features present during training but are missing in the test data
missing_features4 = [feature for feature in
                     loaded_posneg_model.feature_names_in_ if feature not in
                     df_filtered1_rmExcess.columns]

print(len(missing_features4))

0


In [None]:
test_list = df_filtered1_rmExcess.columns

diff1 = [item for item in fit_time_features1 if item not in test_list]
diff2 = [item for item in test_list if item not in fit_time_features1]

print(diff1)
print(diff2)

[]
[]


In [None]:
df_filtered1_rmExcess_reordered = df_filtered1_rmExcess[loaded_posneg_model.feature_names_in_]
df_filtered1_rmExcess_reordered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4074 entries, 2 to 9205
Columns: 19883 entries, 00 to 767
dtypes: float64(6882), int64(13001)
memory usage: 618.0 MB


In [None]:
df_filtered1_rmExcess_reordered.head(5)

Unnamed: 0,00,000,00025,001,002,003,004,00452,005,0056,...,761,762,763,764,765,765.1,766,766.1,767,767.1
2,0,0,0,0,0.0,0,0.0,0,0,0,...,0.0,0.417162,-0.08037,0.0,-0.201294,-0.201294,0.0,0.0,-0.002381,-0.002381
11,0,0,0,0,0.0,0,0.0,0,0,0,...,0.0,0.525658,-0.456155,0.0,-0.226899,-0.226899,0.0,0.0,-0.413062,-0.413062
13,0,0,0,0,0.0,0,0.0,0,0,0,...,0.0,0.089547,-0.431798,0.0,0.288734,0.288734,0.0,0.0,-0.031621,-0.031621
14,0,0,0,0,0.0,0,0.0,0,0,0,...,0.0,0.185645,-0.250878,0.0,0.01842,0.01842,0.0,0.0,-0.12654,-0.12654
15,0,0,0,0,0.0,0,0.0,0,0,0,...,0.0,0.269082,-0.377112,0.0,-0.066565,-0.066565,0.0,0.0,-0.2328,-0.2328


In [None]:
df_drop_dup = df_filtered1_rmExcess_reordered.loc[:, ~df_filtered1_rmExcess_reordered.columns.duplicated()]
df_drop_dup.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4074 entries, 2 to 9205
Columns: 17984 entries, 00 to 764
dtypes: float64(4983), int64(13001)
memory usage: 559.0 MB


In [None]:
df_magic = df_drop_dup[loaded_posneg_model.feature_names_in_]
df_magic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4074 entries, 2 to 9205
Columns: 18606 entries, 00 to 767
dtypes: float64(5605), int64(13001)
memory usage: 578.3 MB


- Calculating prediction speed for Positive/Negative

In [None]:
## Make predictions using the trained model
##-for logistic regression only
start_time2 = time.time()
y_pred_9k1 = loaded_posneg_model.predict(df_magic)
#y_pred_9k = logreg_model2.predict(df_filtered)
end_time2 = time.time()
time_all2 = end_time2 - start_time2

#print(y_pred_9k1)
#print("Time taken for all predictions: ", time_all2, " seconds")
print("Time taken for one prediction: ", time_all2/num_rows, " seconds")

['NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE'
 'NEGATIVE' 'POSITIVE' 'NEGATIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE'
 'NEGATIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'POSITIVE'
 'NEGATIVE' 'NEGATIVE' 'POSITIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE'
 'NEGATIVE' 'POSITIVE' 'NEGATIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE'
 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE'
 'POSITIVE' 'POSITIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE'
 'POSITIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE'
 'NEGATIVE' 'NEGATIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE'
 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE'
 'POSITIVE' 'NEGATIVE' 'NEGATIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE'
 'POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE'
 'POSITIVE' 'NEGATIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE' 'POSITIVE'
 'NEGATIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE'
 'POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE' 'POSITIVE' 'NEGAT

- Total time

In [None]:
print("Time taken for all predictions: ", time_all1 + time_all2, " seconds")
print("Time taken for one prediction: ", (time_all1 + time_all2)/num_rows,
      "seconds")

Time taken for all predictions:  3.42962908744812  seconds
Time taken for one prediction:  0.00037250234467775826 seconds


In [None]:
# Add predictions to the DataFrame
df_magic['FINAL Sentiment'] = y_pred_9k1

# Create a new DataFrame excluding rows with 'neutral' sentiment
df_positive = df_magic[df_magic
 ['FINAL Sentiment'] == 'POSITIVE'].copy()

df_negative = df_magic[df_magic
 ['FINAL Sentiment'] == 'NEGATIVE'].copy()

# Get indices of filtered rows
df_positive_indices = df_positive.index
print(df_positive_indices)
df_negative_indices = df_negative.index
print(df_negative_indices)

# Create new DataFrame with row numbers and 'FINAL Sentiment'
df_positive1 = pd.DataFrame({'Row Number': df_positive_indices,'FINAL Sentiment': df_positive['FINAL Sentiment']})
df_negative1 = pd.DataFrame({'Row Number': df_negative_indices,'FINAL Sentiment': df_negative['FINAL Sentiment']})

df_positive1.info()
df_negative1.info()

#df_positive1.to_csv('df_positive1.csv',index = False)
#df_negative1.to_csv('df_negative1.csv',index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_magic['FINAL Sentiment'] = y_pred_9k1


Index([  22,   27,   33,   45,   56,   57,   70,   89,  101,  105,
       ...
       9169, 9173, 9182, 9186, 9187, 9195, 9198, 9200, 9201, 9204],
      dtype='int64', length=1586)
Index([   2,   11,   13,   14,   15,   18,   21,   26,   28,   30,
       ...
       9168, 9170, 9174, 9175, 9176, 9183, 9188, 9196, 9203, 9205],
      dtype='int64', length=2488)
<class 'pandas.core.frame.DataFrame'>
Index: 1586 entries, 22 to 9204
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Row Number       1586 non-null   int64 
 1   FINAL Sentiment  1586 non-null   object
dtypes: int64(1), object(1)
memory usage: 37.2+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 2488 entries, 2 to 9205
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Row Number       2488 non-null   int64 
 1   FINAL Sentiment  2488 non-null   object
dtypes: int64(1), object(1

In [None]:
# Concatenate the DataFrames based on row numbers
senti3_df = pd.concat([df_neutral1,df_positive1,df_negative1],axis = 0)

# Sort the combined DataFrame based on the 'Row Number' column
senti3_df.sort_values(by='Row Number',inplace=True)

senti3_df.info()
senti3_df.head(20)

<class 'pandas.core.frame.DataFrame'>
Index: 9207 entries, 0 to 9206
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Row Number       9207 non-null   int64 
 1   FINAL Sentiment  9207 non-null   object
dtypes: int64(1), object(1)
memory usage: 215.8+ KB


Unnamed: 0,Row Number,FINAL Sentiment
0,0,NEUTRAL
1,1,NEUTRAL
2,2,NEGATIVE
3,3,NEUTRAL
4,4,NEUTRAL
5,5,NEUTRAL
6,6,NEUTRAL
7,7,NEUTRAL
8,8,NEUTRAL
9,9,NEUTRAL


In [None]:
#senti3_df.to_csv('senti3_df.csv',index=False)

In [None]:
# Read the manually labelled data
df_9k = pd.read_csv('combined_csv_dupDropped_9kUnlabelled.csv')

# Columns to drop
columns_to_drop = ['Sentiment1','Sentiment2','FINAL Sentiment']

# Assuming df_9k is your DataFrame and columns_to_drop is a list of column names to drop
df_9k.drop(columns=columns_to_drop, inplace=True)

df_9k.info()

# Concatenate the DataFrames based on row numbers
df_9k_sentiLabelled = pd.concat([df_9k,senti3_df],axis = 1)

# Drop the 'Row Number' column
df_9k_sentiLabelled.drop(columns='Row Number', inplace=True)

df_9k_sentiLabelled.info()
df_9k_sentiLabelled.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9207 entries, 0 to 9206
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Headline     9207 non-null   object
 1   Source       9207 non-null   object
 2   Posted       9207 non-null   object
 3   Description  9207 non-null   object
 4   Link         9207 non-null   object
dtypes: object(5)
memory usage: 359.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9207 entries, 0 to 9206
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Headline         9207 non-null   object
 1   Source           9207 non-null   object
 2   Posted           9207 non-null   object
 3   Description      9207 non-null   object
 4   Link             9207 non-null   object
 5   FINAL Sentiment  9207 non-null   object
dtypes: object(6)
memory usage: 431.7+ KB


Unnamed: 0,Headline,Source,Posted,Description,Link,FINAL Sentiment
0,Warren Buffett Has $159 Billion Invested in Th...,Motley Fool via Yahoo Finance,31/3/2024,Buffett has invested $159 billion in these eig...,https://finance.yahoo.com/news/warren-buffett-...,NEUTRAL
1,Is It Safe to Buy Stocks With the S&P 500 at a...,Motley Fool via Yahoo Finance,31/3/2024,"Horton, and StoneCo, substantially reduced its...",https://finance.yahoo.com/news/safe-buy-stocks...,NEUTRAL
2,Investment Advisory Services Inc. TX ADV Acqui...,ETF DAILY NEWS,31/3/2024,Investment Advisory Services Inc. TX ADV incre...,https://www.etfdailynews.com/2024/03/31/invest...,NEGATIVE
3,Spring Cleaning: 3 Stocks to Say Goodbye to Ri...,Motley Fool via Yahoo Finance,31/3/2024,Image source: Getty Images. Will the rise of A...,https://finance.yahoo.com/news/spring-cleaning...,NEUTRAL
4,Intel's Lunar Lake Could Be a Threat to Apple'...,Motley Fool via Yahoo Finance,30/3/2024,"Apple (NASDAQ: AAPL), by virtue of controlling...",https://finance.yahoo.com/news/intels-lunar-la...,NEUTRAL
5,Will Warren Buffett Sell More Apple Stock Beca...,Motley Fool via Yahoo Finance,30/3/2024,Warren Buffett has been a big fan of Apple (NA...,https://finance.yahoo.com/news/warren-buffett-...,NEUTRAL
6,Here's How Much Dividend Income You'd Make by ...,Motley Fool via Yahoo Finance,31/3/2024,Microsoft (NASDAQ: MSFT) initiated its dividen...,https://finance.yahoo.com/news/heres-much-divi...,NEUTRAL
7,Is Trending Stock Apple Inc. (AAPL) a Buy Now?,Zacks via Yahoo Finance,29/3/2024,Apple (AAPL) has been one of the most searched...,https://finance.yahoo.com/news/trending-stock-...,NEUTRAL
8,"Should You Buy This ""Magnificent Seven"" Stock ...",Motley Fool via Yahoo Finance,30/3/2024,"The ""Magnificent Seven,"" a group of highly acc...",https://finance.yahoo.com/news/buy-magnificent...,NEUTRAL
9,5 big analyst AI moves: Apple-Baidu collaborat...,Investing.com,31/3/2024,According to a report from the Wall Street Jou...,https://ng.investing.com/news/stock-market-new...,NEUTRAL


In [None]:
df_9k_sentiLabelled.to_csv('df_9k_sentiLabelled.csv',index=False)

# Random Accuracy Test

In [None]:
import random

# Generate a random number from the list of rows to choose the row in which the model is tested on
random_int = random.randint(0, (len(df_9k)-1))
print("Random row chosen:", random_int)

Random row chosen: 890


In [None]:
import pandas as pd

#Set the maximum column width to display the full content
pd.set_option('display.max_colwidth',None)

random_row = pd.DataFrame(df_9k_sentiLabelled.iloc[888]).transpose() #shows the 890th row
print("Random row:")
print(random_row)


Random row:
                                                                    Headline  \
888  Microsoft Co. (NASDAQ:MSFT) Shares Sold by Marshall Financial Group LLC   

             Source     Posted  \
888  ETF DAILY NEWS  26/3/2024   

                                                                                                                                                                        Description  \
888  Marshall Financial Group LLC lowered its holdings in Microsoft Co. (NASDAQ:MSFT – Free Report) by 3.1% in the fourth quarter, according to its most recent Form 13F filing ...   

                                                                                                             Link  \
888  https://www.etfdailynews.com/2024/03/26/microsoft-co-nasdaqmsft-shares-sold-by-marshall-financial-group-llc/   

    FINAL Sentiment  
888        NEGATIVE  


In [2]:
import random
# Sentiment generated by random chance

# Define the three sentiments
Senti = ['POSITIVE','NEGATIVE','NEUTRAL']

# Select random text string
random_senti = random.choice(Senti)

#Print the randomly selected Sentiment
print("Randomly selected Sentiment:",random_senti)

Randomly selected Sentiment: POSITIVE
