# Rule-Based Decision Tree

In [None]:
!pip install -q tqdm

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import multiprocessing, warnings, re, string, os
from tqdm.auto import tqdm

tqdm.pandas()

In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
DATA_PATH = "drive/MyDrive/all-the-news-2-1-SMALL-CLEANED.csv"
df = pd.read_csv(DATA_PATH)
print(df.head(3)[["publication", "clean_article", "split"]])

# Split provided by the file
train_df = df[df["split"] == "train"].reset_index(drop=True)
test_df  = df[df["split"] == "test"].reset_index(drop=True)

#Confirming
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

          publication                                      clean_article  \
0  The New York Times   a love of [NAME] and slap bracelets, [NAME] s...   
1  The New York Times  warm, occasionally downright balmy, weather, a...   
2  The New York Times  dably confused. When he was a boy, Havana was ...   

   split  
0  train  
1  train  
2  train  
Train size: 90000, Test size: 10000


In [76]:
feature_keywords = {
    "politics_general": ["Trump", "white house", "congress"],
    "democrat_themes": ["Democrat", "liberal"],
    "republican_themes": ["Republican", "MAGA"],
    "personal": ["I ", "we ", "us "],
    "dateline": ["LONDON", "NEW YORK", "WASHINGTON", "VATICAN CITY", "BEIJING"],
    "formal verbs": ["said", "reported", "confirmed", "told", "announced", "stated", "added", "noted", "explained", "acknowledged", "claimed", "revealed", "outlined", "indicated", "suggested", "asserted", "responded", "disclosed", "emphasized",],
    "informal verbs": ["chimed in", "claimed", "demanded", "argued", "exposed", "called out", "blamed", "declared", "exclaimed", "hinted", "warned", "complained", "reported", "spoke", "shouted", "accused", "mocked"],
    "Economist_themes": ["regime", "GDP", "policymaker"],
    "Fox_News_themes": ["violent", "woke", ", D-", ", R-"],
    "People_themes": ["best", "exclusive", "new", "photos"],
    "Politio_themes": ["signal", "clear", "maneuver", "advance"],
    "The_Hill_themes": ["hearing", "midterm", "briefing", "aide", "committee", "chair", ""],
    "NYTimes_themes": ["Mr.", "Ms.", "Dr.", "Broadway"]
}

In [77]:
def extract_features(text):
    return [int(any(kw in text for kw in keywords)) for keywords in feature_keywords.values()]
train_df["features"] = train_df["clean_article"].apply(extract_features)
test_df["features"] = test_df["clean_article"].apply(extract_features)

In [78]:
# Turn feature list into DataFrame
X_train = pd.DataFrame(train_df["features"].tolist(), columns=feature_keywords.keys())
y_train = train_df["publication"]
X_test = pd.DataFrame(test_df["features"].tolist(), columns=feature_keywords.keys())
y_test = test_df["publication"]

# Train decision tree classifier
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

                    precision    recall  f1-score   support

     Buzzfeed News       0.00      0.00      0.00      1000
               CNN       0.00      0.00      0.00      1000
         Economist       0.47      0.17      0.25      1000
          Fox News       0.00      0.00      0.00      1000
            People       0.16      0.71      0.26      1000
          Politico       0.31      0.46      0.37      1000
           Reuters       0.00      0.00      0.00      1000
          The Hill       0.35      0.56      0.43      1000
The New York Times       0.63      0.59      0.61      1000
              Vice       0.30      0.37      0.33      1000

          accuracy                           0.29     10000
         macro avg       0.22      0.29      0.23     10000
      weighted avg       0.22      0.29      0.23     10000

