In [23]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [None]:
# Read the train and validation CSV files (created earlier) into DataFrames.
# The generator yields the frames in the same order as the file names listed.
train_data, validation_data = (
    pd.read_csv(csv_name) for csv_name in ('train_data.csv', 'validation_data.csv')
)

In [3]:
# Split each DataFrame into its text column (model input) and its label column (target)
X_train, y_train = train_data['stemmed_tokens'], train_data['type']
X_val, y_val = validation_data['stemmed_tokens'], validation_data['type']

# Bag-of-words representation: the vocabulary is learned from the training texts
# only and capped at the 10,000 most frequent terms; the validation texts are
# merely transformed with that same vocabulary.
vectorizer = CountVectorizer(max_features=10000)
X_train_bow = vectorizer.fit_transform(X_train)
X_val_bow = vectorizer.transform(X_val)

# Sanity check: each matrix should be (sample_size, 10000)
print(X_train_bow.shape, X_val_bow.shape, sep="\n")

(609283, 10000)
(76160, 10000)


`max_iter=1500` er nok til, at modellen konvergerer; resultatet bliver ikke bedre med 2000 end med 1500 :)) \
Jeg har ikke fjernet satire i Part1Task4 :)) Så stor forskel gør det alligevel ikke

Task 1

In [4]:
# Baseline: logistic regression on the bag-of-words features.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(max_iter=1500).fit(X_train_bow, y_train)

# Predicted labels for the validation set
y_val_pred = model.predict(X_val_bow)

# Score the predictions; F1 is computed with class 1 as the positive label
report = classification_report(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred, pos_label=1)
accuracy = accuracy_score(y_val, y_val_pred)

# Show the summary metrics followed by the per-class report
print(f"Accuracy: {accuracy:.4f}", f"F1 Score: {f1:.4f}", sep="\n")
print("Classification Report:\n", report)

Accuracy: 0.8425
F1 Score: 0.8369
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85     39503
           1       0.83      0.84      0.84     36657

    accuracy                           0.84     76160
   macro avg       0.84      0.84      0.84     76160
weighted avg       0.84      0.84      0.84     76160



Task 1 optimeret

In [5]:
# Same model as the baseline, but with class_weight='balanced', which weights
# each class inversely to its frequency in the training labels.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(max_iter=1500, class_weight='balanced').fit(X_train_bow, y_train)

# Predicted labels for the validation set
y_val_pred = model.predict(X_val_bow)

# Score the predictions; F1 is computed with class 1 as the positive label
report = classification_report(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred, pos_label=1)
accuracy = accuracy_score(y_val, y_val_pred)

# Show the summary metrics followed by the per-class report
print(f"Accuracy: {accuracy:.4f}", f"F1 Score: {f1:.4f}", sep="\n")
print("Classification Report:\n", report)

Accuracy: 0.8424
F1 Score: 0.8393
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.83      0.85     39503
           1       0.82      0.86      0.84     36657

    accuracy                           0.84     76160
   macro avg       0.84      0.84      0.84     76160
weighted avg       0.84      0.84      0.84     76160



Task 1 mere optimeret

In [None]:
# Balanced class weights plus C=8.0. C is the inverse regularization strength,
# so this regularizes less than the default C=1.0.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(
    max_iter=1500,
    class_weight='balanced',
    C=8.0,
).fit(X_train_bow, y_train)

# Predicted labels for the validation set
y_val_pred = model.predict(X_val_bow)

# Score the predictions; F1 is computed with class 1 as the positive label
report = classification_report(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred, pos_label=1)
accuracy = accuracy_score(y_val, y_val_pred)

# Show the summary metrics followed by the per-class report
print(f"Accuracy: {accuracy:.4f}", f"F1 Score: {f1:.4f}", sep="\n")
print("Classification Report:\n", report)

Accuracy: 0.8423
F1 Score: 0.8392
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.83      0.85     39503
           1       0.82      0.85      0.84     36657

    accuracy                           0.84     76160
   macro avg       0.84      0.84      0.84     76160
weighted avg       0.84      0.84      0.84     76160



Task 2 med domain som features (ud over main-text selvfølgelig) - er det snyd? Det er meget muligt. Men de skriver ikke, at det er ulovligt. Så vi kunne evt. skrive, at det er snyd at bruge domain som features, når det er det, der er brugt til at definere label'sne - og derfor modellen bliver så helt ekstremt god, fordi den basically får svarene givet.

In [17]:
# One-hot encode the 'domain' column. The categories are learned from the
# training split only; handle_unknown='ignore' lets domains that were not seen
# during fit be encoded as all zeros instead of raising an error.
# NOTE(review): the notebook text accompanying this experiment suggests the
# labels were derived from the domain, so this feature may leak the target.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_domain = encoder.fit_transform(train_data[['domain']])
X_val_domain = encoder.transform(validation_data[['domain']])

# Append the domain columns to the right of the bag-of-words columns
# (horizontal stacking of sparse matrices; row order is unchanged)
X_train_combined = hstack((X_train_bow, X_train_domain))
X_val_combined = hstack((X_val_bow, X_val_domain))

# Report the resulting (rows, num_features) shapes
print(
    f"Final Feature Matrix Shape (training): {X_train_combined.shape}",
    f"Final Feature Matrix Shape (validation): {X_val_combined.shape}",
    sep="\n",
)

Final Feature Matrix Shape (training): (609283, 10524)
Final Feature Matrix Shape (validation): (76160, 10524)


In [18]:
# Logistic regression on the combined text + domain feature matrix.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(max_iter=1500).fit(X_train_combined, y_train)

# Predicted labels for the validation set
y_val_pred = model.predict(X_val_combined)

# Score the predictions; F1 is computed with class 1 as the positive label
report = classification_report(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred, pos_label=1)
accuracy = accuracy_score(y_val, y_val_pred)

# Show the summary metrics followed by the per-class report
print(f"Accuracy: {accuracy:.4f}", f"F1 Score: {f1:.4f}", sep="\n")
print("Classification Report:\n", report)

Accuracy: 0.9995
F1 Score: 0.9995
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     39503
           1       1.00      1.00      1.00     36657

    accuracy                           1.00     76160
   macro avg       1.00      1.00      1.00     76160
weighted avg       1.00      1.00      1.00     76160



Task 2 med title som features (ud over main-text)

fillna("") replaces NaN values with an empty string, preventing TfidfVectorizer from crashing. The empty string ensures that missing titles do not introduce errors but contribute zero influence in the TF-IDF matrix :))

TF-IDF is generally better than just BOW for short texts like titles because it helps differentiate important words from common ones - this is relevant since we haven't preprocessed the text in the titles :)) \
Using title as features can help catch clickbait fake news


In [25]:
# Missing titles become empty strings so the vectorizer receives only text.
# The columns are overwritten in place; re-running this cell is harmless
# because filling an already-filled column changes nothing.
train_data['title'] = train_data['title'].fillna("")
validation_data['title'] = validation_data['title'].fillna("")

# TF-IDF features for the titles: vocabulary (at most 5000 terms) and IDF
# weights are learned from the training titles only.
vectorizer_title = TfidfVectorizer(max_features=5000)
X_train_title = vectorizer_title.fit_transform(train_data['title'])
X_val_title = vectorizer_title.transform(validation_data['title'])

# Append the title columns to the right of the bag-of-words columns
# (horizontal stacking of sparse matrices; row order is unchanged)
X_train_co_title = hstack((X_train_bow, X_train_title))
X_val_co_title = hstack((X_val_bow, X_val_title))

# Report the resulting (rows, num_features) shapes
print(
    f"Final Feature Matrix Shape (training): {X_train_co_title.shape}",
    f"Final Feature Matrix Shape (validation): {X_val_co_title.shape}",
    sep="\n",
)

Final Feature Matrix Shape (training): (609283, 15000)
Final Feature Matrix Shape (validation): (76160, 15000)


In [28]:
# Logistic regression on the combined text + title feature matrix.
# This cell uses a higher iteration cap (2500) than the text-only models.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(max_iter=2500).fit(X_train_co_title, y_train)

# Predicted labels for the validation set
y_val_pred = model.predict(X_val_co_title)

# Score the predictions; F1 is computed with class 1 as the positive label
report = classification_report(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred, pos_label=1)
accuracy = accuracy_score(y_val, y_val_pred)

# Show the summary metrics followed by the per-class report
print(f"Accuracy: {accuracy:.4f}", f"F1 Score: {f1:.4f}", sep="\n")
print("Classification Report:\n", report)

Accuracy: 0.8732
F1 Score: 0.8678
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88     39503
           1       0.87      0.86      0.87     36657

    accuracy                           0.87     76160
   macro avg       0.87      0.87      0.87     76160
weighted avg       0.87      0.87      0.87     76160



Stadig task 2 med title, her optimeret

In [29]:
# Text + title features again, now with class_weight='balanced', which weights
# each class inversely to its frequency in the training labels.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(max_iter=2500, class_weight='balanced').fit(X_train_co_title, y_train)

# Predicted labels for the validation set
y_val_pred = model.predict(X_val_co_title)

# Score the predictions; F1 is computed with class 1 as the positive label
report = classification_report(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred, pos_label=1)
accuracy = accuracy_score(y_val, y_val_pred)

# Show the summary metrics followed by the per-class report
print(f"Accuracy: {accuracy:.4f}", f"F1 Score: {f1:.4f}", sep="\n")
print("Classification Report:\n", report)

Accuracy: 0.8733
F1 Score: 0.8689
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88     39503
           1       0.87      0.87      0.87     36657

    accuracy                           0.87     76160
   macro avg       0.87      0.87      0.87     76160
weighted avg       0.87      0.87      0.87     76160



Task 3, preprocessing er gjort i Part1Task2 sammen med processing af det store dataset :))

In [None]:
# Load the processed scraped articles (CSV created earlier) into a DataFrame
processed_data = pd.read_csv('processed_scraped_articles.csv')

In [None]:
# Fail early with a clear message if any scraped article has no text:
# CountVectorizer rejects NaN documents, and the error would otherwise only
# surface later, in the vectorization step.
assert processed_data['stemmed_tokens'].notna().all(), \
    "processed_data['stemmed_tokens'] contains missing values"

# Append the scraped articles' stemmed tokens to the training texts.
# ignore_index=True renumbers the result 0..n-1 so that the indices of the two
# sources cannot collide.
X_train_extended = pd.concat([X_train, processed_data['stemmed_tokens']], ignore_index=True)

# Append the matching labels in the same order, so texts and labels stay
# aligned row by row.
# NOTE(review): the original comment implies every scraped article has type 0 - confirm.
y_train_extended = pd.concat([y_train, processed_data['type']], ignore_index=True)

# Verify the concatenation programmatically instead of comparing printed shapes by eye
assert len(X_train_extended) == len(X_train) + len(processed_data), \
    "extended training texts have an unexpected number of rows"
assert len(y_train_extended) == len(y_train) + len(processed_data), \
    "extended training labels have an unexpected number of rows"
assert len(X_train_extended) == len(y_train_extended), \
    "extended texts and labels are misaligned"

# Shapes kept in the output for the reader
print(processed_data.shape)                        # (n_scraped, n_columns)
print(X_train.shape)                               # (n_train, )
print(X_train_extended.shape)                      # (n_train + n_scraped, )
print(y_train.shape)                               # (n_train, )
print(y_train_extended.shape)                      # (n_train + n_scraped, )

(713, 2)
(609283,)
(609996,)
(609283,)
(609996,)


In [21]:
# Bag-of-words for the extended training set. A separate vectorizer is fitted
# so that the vocabulary (capped at the 10,000 most frequent terms) also
# reflects the appended articles; the validation texts are only transformed.
vectorizer2 = CountVectorizer(max_features=10000)
X_train_ext_bow = vectorizer2.fit_transform(X_train_extended)
X_val_ext_bow = vectorizer2.transform(X_val)

# Sanity check: each matrix should be (sample_size, 10000)
print(X_train_ext_bow.shape, X_val_ext_bow.shape, sep="\n")

(609996, 10000)
(76160, 10000)


In [22]:
# Balanced logistic regression trained on the extended training set
# (original training texts plus the appended scraped articles).
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(max_iter=1500, class_weight='balanced').fit(X_train_ext_bow, y_train_extended)

# Predicted labels for the (unchanged) validation set
y_val_pred = model.predict(X_val_ext_bow)

# Score the predictions; F1 is computed with class 1 as the positive label
report = classification_report(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred, pos_label=1)
accuracy = accuracy_score(y_val, y_val_pred)

# Show the summary metrics followed by the per-class report
print(f"Accuracy: {accuracy:.4f}", f"F1 Score: {f1:.4f}", sep="\n")
print("Classification Report:\n", report)

Accuracy: 0.8426
F1 Score: 0.8395
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.83      0.85     39503
           1       0.82      0.86      0.84     36657

    accuracy                           0.84     76160
   macro avg       0.84      0.84      0.84     76160
weighted avg       0.84      0.84      0.84     76160



Det hjalp kun meeeeeget lidt, men det er lidt som forventet når man smider 713 artikler på et training-set der indeholder 600.000 artikler :))