In [2]:
# Install and import required libraries
import pandas as pd
import numpy as np
from google.colab import files
import io

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
print("\n Please upload your 'Data.csv' file:")
uploaded = files.upload()
filename = list(uploaded.keys())[0]
df = pd.read_csv(io.BytesIO(uploaded[filename]), encoding="ISO-8859-1")
print(f"\nLoaded '{filename}' with shape {df.shape}\n")
print(df.head())


 Please upload your 'Data.csv' file:


Saving Data.csv to Data.csv

Loaded 'Data.csv' with shape (4101, 27)

         Date  Label                                               Top1  \
0  2000-01-03      0  A 'hindrance to operations': extracts from the...   
1  2000-01-04      0                                          Scorecard   
2  2000-01-05      0                  Coventry caught on counter by Flo   
3  2000-01-06      1                      Pilgrim knows how to progress   
4  2000-01-07      1                               Hitches and Horlocks   

                                 Top2  \
0                           Scorecard   
1                 The best lake scene   
2  United's rivals on the road to Rio   
3                 Thatcher facing ban   
4      Beckham off but United survive   

                                            Top3  \
0                Hughes' instant hit buoys Blues   
1                  Leader: German sleaze inquiry   
2  Thatcher issues defence before trial by video   
3        McIlroy calls f

In [5]:
# Split into train and test based on Date
print("\n Splitting data into training (Date < 2015-01-01) and test (Date ≥ 2015-01-01)...")
train = df[df['Date'] < '20150101'].copy()
test  = df[df['Date'] > '20141231'].copy()
print(f"Training samples: {len(train)}, Test samples: {len(test)}")


 Splitting data into training (Date < 2015-01-01) and test (Date ≥ 2015-01-01)...
Training samples: 3975, Test samples: 378


In [7]:
# Text cleaning (remove non-letters)
print("\n Cleaning headlines (keep only letters)...")
data = train.iloc[:, 2:27].replace("[^a-zA-Z]", " ", regex=True)
data.columns = [str(i) for i in range(data.shape[1])]
print(data.head(1))

# Lowercasing all text
print("\n Converting to lowercase...")
for col in data.columns:
    data[col] = data[col].str.lower()
print("Sample cleaned row:\n", data.iloc[0].tolist())


 Cleaning headlines (keep only letters)...
                                                   0          1  \
0  A  hindrance to operations   extracts from the...  Scorecard   

                                 2                                         3  \
0  Hughes  instant hit buoys Blues  Jack gets his skates on at ice cold Alex   

                                        4  \
0  Chaos as Maracana builds up for United   

                                                   5  \
0  Depleted Leicester prevail as Elliott spoils E...   

                                  6                                  7  \
0  Hungry Spurs sense rich pickings  Gunners so wide of an easy target   

                                               8  \
0  Derby raise a glass to Strupar s debut double   

                                          9  ...  \
0  Southgate strikes  Leeds pay the penalty  ...   

                                         15  \
0  Flintoff injury piles on woe for England   

  

In [9]:
# Combine columns into full headlines
print("\n Combining columns into single headline strings...")
# Convert all values to strings before joining to handle potential non-string types (like floats for NaN)
headlines = data.apply(lambda row: " ".join(row.values.astype(str)), axis=1)
print("Example headline:", headlines.iloc[0])

# Vectorize using bigrams (2-word sequences)
print("\n Creating bag-of-bigrams model...")
vectorizer = CountVectorizer(ngram_range=(2, 2))
X_train = vectorizer.fit_transform(headlines)
y_train = train['Label'].values
print(f"Vocabulary size (bigrams): {len(vectorizer.vocabulary_)}")


 Combining columns into single headline strings...
Example headline: a  hindrance to operations   extracts from the leaked reports scorecard hughes  instant hit buoys blues jack gets his skates on at ice cold alex chaos as maracana builds up for united depleted leicester prevail as elliott spoils everton s party hungry spurs sense rich pickings gunners so wide of an easy target derby raise a glass to strupar s debut double southgate strikes  leeds pay the penalty hammers hand robson a youthful lesson saints party like it s      wear wolves have turned into lambs stump mike catches testy gough s taunt langer escapes to hit     flintoff injury piles on woe for england hunters threaten jospin with new battle of the somme kohl s successor drawn into scandal the difference between men and women sara denver  nurse turned solicitor diana s landmine crusade put tories in a panic yeltsin s resignation caught opposition flat footed russian roulette sold out recovering a title

 Creating bag-of-

In [10]:
#  Train Random Forest classifier
print("\n Training Random Forest classifier...")
rf = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=0)
rf.fit(X_train, y_train)
print("Model training complete!")


 Training Random Forest classifier...
Model training complete!


In [11]:
# Prepare test headlines and transform
print("\n Cleaning and transforming test headlines...")
test_data = test.iloc[:, 2:27].replace("[^a-zA-Z]", " ", regex=True).astype(str)
for col in test_data.columns:
    test_data[col] = test_data[col].str.lower()
X_test_raw = test_data.apply(lambda row: " ".join(row.values), axis=1)
X_test = vectorizer.transform(X_test_raw)


 Cleaning and transforming test headlines...


In [12]:
# Predict and evaluate on test set
print("\n🔮 Making predictions on test set...")
y_test = test['Label'].values
y_pred = rf.predict(X_test)

print("\n📊 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(f"\n✅ Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print("📝 Classification Report:")
print(classification_report(y_test, y_pred))


🔮 Making predictions on test set...

📊 Confusion Matrix:
[[142  44]
 [ 11 181]]

✅ Accuracy: 0.8545

📝 Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.76      0.84       186
           1       0.80      0.94      0.87       192

    accuracy                           0.85       378
   macro avg       0.87      0.85      0.85       378
weighted avg       0.87      0.85      0.85       378

