In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Load the raw CSV. It is read without a header row and decoded as Latin-1.
df = pd.read_csv(
    'kaggle-training/all-data.csv',
    header=None,
    encoding='latin-1',
)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
5,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
6,positive,"For the last quarter of 2010 , Componenta 's n..."
7,positive,"In the third quarter of 2010 , net sales incre..."
8,positive,Operating profit rose to EUR 13.1 mn from EUR ...
9,positive,"Operating profit totalled EUR 21.1 mn , up fro..."


In [3]:
# Give the two columns descriptive names: the label first, the text second.
df.columns = ['sentiment', 'headline']

# Plain-text preview of the first five rows.
print(df.head(5))

  sentiment                                           headline
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...


In [4]:
# Number of rows in the loaded frame.
print('row count: ', len(df))

row count:  4846


In [5]:
#train_set, test_set = train_test_split(df,test_size=0.2)
#print("training set: ", train_set.shape)
#print("testing set: ", test_set.shape)

In [6]:
# Separate the headline text (features) from the sentiment label (target).
X, y = df['headline'], df['sentiment']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

In [8]:
# Show the size of each split, in the order: train features, test features,
# train labels, test labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(3876,)
(970,)
(3876,)
(970,)


In [9]:
# Extract features from the headlines with a term frequency-inverse document
# frequency (TF-IDF) vectorizer, ignoring English stop words and capping the
# vocabulary at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit on the training headlines only, then apply the fitted vectorizer to the
# test headlines.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [10]:
# Dump the vectorized training matrix; each printed entry is a (row, column)
# position followed by the weight stored there.
print(X_train_vec)

  (0, 2670)	0.27091155179146137
  (0, 892)	0.1535385593823909
  (0, 1893)	0.29079430361892006
  (0, 4395)	0.3177735167393831
  (0, 4856)	0.24592881612880818
  (0, 2727)	0.27091155179146137
  (0, 2711)	0.27959691847289214
  (0, 1689)	0.2526177053524291
  (0, 2977)	0.20826775886910454
  (0, 1531)	0.2402226017989342
  (0, 4102)	0.20182547729393766
  (0, 4007)	0.1629946993096093
  (0, 1364)	0.2402226017989342
  (0, 2669)	0.18207489121987233
  (0, 3436)	0.23225143454958694
  (0, 4935)	0.26069960603912656
  (0, 2531)	0.24592881612880818
  (1, 1430)	0.4877202376201351
  (1, 2151)	0.5635331077302123
  (1, 3891)	0.23453057639456223
  (1, 2341)	0.46145757129399256
  (1, 4780)	0.42025197791226643
  (2, 4516)	0.15319320327413008
  (2, 1123)	0.1927442149645805
  (2, 2741)	0.19008552696582642
  :	:
  (3874, 313)	0.4650762956105486
  (3874, 1685)	0.41180586929068447
  (3874, 2129)	0.4771367201537468
  (3874, 3772)	0.4235475702833861
  (3874, 2654)	0.2753061123841253
  (3874, 1599)	0.22680166681042982

In [29]:
# Train the model.
# `multi_class` is intentionally not passed: the argument is deprecated since
# scikit-learn 1.5, and with the lbfgs solver the default already fits a
# multinomial model when the target has more than two classes, so the fitted
# model is the same as with multi_class='multinomial'.
# max_iter is set far above the library default to give the solver room to
# converge on the sparse TF-IDF features.
model = LogisticRegression(solver='lbfgs', max_iter=10000)
model.fit(X_train_vec, y_train)

LogisticRegression(max_iter=10000, multi_class='multinomial')

In [30]:
# Predict a sentiment label for each row of the vectorized test set.
y_pred = model.predict(X_test_vec)


In [31]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Compute the three evaluation artefacts first, then print them in order.
accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Accuracy, shown as a percentage with two decimals.
print(f"Accuracy: {accuracy * 100:.2f}%")

# Confusion matrix: rows are the true labels, columns the predicted labels.
print("Confusion Matrix:", matrix, sep="\n")

# Per-class precision, recall, F1 and support.
print("Classification Report:", report, sep="\n")


Accuracy: 70.10%
Confusion Matrix:
[[ 46  66  26]
 [  6 523  28]
 [  9 155 111]]
Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.33      0.46       138
     neutral       0.70      0.94      0.80       557
    positive       0.67      0.40      0.50       275

    accuracy                           0.70       970
   macro avg       0.71      0.56      0.59       970
weighted avg       0.70      0.70      0.67       970



In [32]:
# Sanity-check the trained model on a hand-written headline.
new_headline = ["Apple to close down"]
# Reuse the already-fitted vectorizer so the new text is encoded with the same
# vocabulary the model was trained on.
new_headline_vec = vectorizer.transform(new_headline)
prediction = model.predict(new_headline_vec)
print(prediction)

['neutral']


In [37]:
# Try a second hand-written headline against the same fitted vectorizer and model.
new_headline = ["YouTube surpasses Netflix as top video platform for teens: Survey"]
# transform (not fit_transform): the vocabulary must stay the one learned in training.
new_headline_vec = vectorizer.transform(new_headline)
prediction = model.predict(new_headline_vec)
print(prediction)

['neutral']


In [40]:
# Absolute number of headlines in each sentiment class.
class_distribution = df['sentiment'].value_counts()
print(class_distribution)

# The same breakdown expressed as a percentage of all rows.
class_distribution_percentage = 100 * df['sentiment'].value_counts(normalize=True)
print(class_distribution_percentage)

neutral     2879
positive    1363
negative     604
Name: sentiment, dtype: int64
neutral     59.409823
positive    28.126290
negative    12.463888
Name: sentiment, dtype: float64


In [43]:
# Re-split the data with stratification, then balance the training portion by
# undersampling the larger classes down to the size of the 'negative' class.
X = df['headline']   # news headlines
y = df['sentiment']  # sentiment labels

# 80/20 split with a fixed seed; stratify=y keeps the class proportions the
# same in the training and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Put text and label side by side so training rows can be filtered by class.
df_train = pd.concat([X_train, y_train], axis=1)

# One frame per sentiment class.
df_train_negative = df_train.loc[df_train['sentiment'] == 'negative']
df_train_positive = df_train.loc[df_train['sentiment'] == 'positive']
df_train_neutral = df_train.loc[df_train['sentiment'] == 'neutral']

# 'negative' is treated as the minority class; its row count becomes the
# per-class target size.
minority_count = df_train_negative.shape[0]

# Randomly shrink the other two classes to that size (seeded for repeatability).
df_train_positive_downsampled = df_train_positive.sample(n=minority_count, random_state=42)
df_train_neutral_downsampled = df_train_neutral.sample(n=minority_count, random_state=42)

# Stack the three equally sized classes, shuffle the rows, and renumber the index.
df_train_downsampled = (
    pd.concat([df_train_negative, df_train_positive_downsampled, df_train_neutral_downsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Balanced training features and labels used by the cells that follow.
X_train_downsampled = df_train_downsampled['headline']
y_train_downsampled = df_train_downsampled['sentiment']


In [44]:

# 1. Feature Extraction
# A fresh TF-IDF vectorizer is fitted on the balanced training headlines only;
# the test headlines are then encoded with that fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train_downsampled)
X_test_vec = vectorizer.transform(X_test)


In [45]:

# 2. Model Training
# Fit logistic regression on the balanced TF-IDF features. fit() returns the
# estimator itself, so construction and fitting can be chained.
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_vec, y_train_downsampled)

# Display the fitted estimator as the cell output.
model


LogisticRegression(max_iter=1000, random_state=42)

In [46]:

# 3. Evaluation
# Predict labels for the vectorized test set.
y_pred = model.predict(X_test_vec)

# Print the heading, a blank line, then the per-class metrics.
print("Classification Report:\n", classification_report(y_test, y_pred), sep="\n")

Classification Report:

              precision    recall  f1-score   support

    negative       0.49      0.73      0.58       121
     neutral       0.78      0.73      0.75       576
    positive       0.55      0.50      0.52       273

    accuracy                           0.66       970
   macro avg       0.60      0.65      0.62       970
weighted avg       0.68      0.66      0.67       970



In [50]:
# Check the model trained on the downsampled data with a hand-written headline.
new_headline = ["Apple to lay off 1000 workers"]
# Encode with the vectorizer fitted on the downsampled training set.
new_headline_vec = vectorizer.transform(new_headline)
prediction = model.predict(new_headline_vec)
print(prediction)

['negative']


The logistic regression model appears to consistently classify all headlines as neutral, whether they are actually positive, negative, or neutral, which is clearly not correct. I am not yet sure whether this is caused by a problem with the dataset used for training (my scraper is still running) or by the model itself. I will experiment further with other models.