In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px
import preprocessing
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [42]:
input_file = "./article_dataset.csv"
df_raw = pd.read_csv(input_file, index_col="id")
df_raw = preprocessing.clean_df(df_raw)
df_raw = df_raw[df_raw['cleaned_text'].str.strip() != ""].copy()
df_raw.reset_index(inplace=True)
df_raw['id'] = df_raw.index
df_raw["clean_title"] = df_raw['title'].apply(preprocessing.clean_text)

df = df_raw.copy()
df = preprocessing.turn_into_pu(df, 0.5)

X_train, X_test, y_train, y_test = preprocessing.test_train_split(df)
y_train.index = X_train.index
y_test_true = df_raw.loc[y_test.index, 'label']

X_train['tokens'] = X_train['cleaned_text'].apply(lambda x: x.split())
df.head()

Unnamed: 0,id,title,author,text,label,cleaned_text,clean_title
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide didnt even see comeys letter ja...,house dem aide didnt even see comeys letter ja...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,1,ever get feeling life circle roundabout rather...,flynn hillary clinton big woman campus breitbart
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,truth might get fired october tension intellig...,truth might get fired
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,video civilian killed single u airstrike ident...,civilian killed single u airstrike identified
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,print iranian woman sentenced six year prison ...,iranian woman jailed fictional unpublished sto...


In [43]:
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(), lowercase=True)
tfidf_matrix = vectorizer.fit_transform(X_train['cleaned_text'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=X_train.index)

pca = PCA(n_components=30)
principal_components = pca.fit_transform(tfidf_df)
print("Explained Variance: ", pca.explained_variance_ratio_.cumsum())


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



Explained Variance:  [0.01361221 0.02223351 0.02836963 0.03401142 0.03859039 0.0422613
 0.04585219 0.04922678 0.05240225 0.05523649 0.0579083  0.060539
 0.06297905 0.06538413 0.06769855 0.06985503 0.07194448 0.07401531
 0.07598834 0.07790957 0.07976904 0.0815863  0.08334781 0.0850625
 0.08671475 0.08830115 0.08987172 0.09141517 0.09293597 0.09442402]


In [44]:
analyzer = SentimentIntensityAnalyzer()
def sentiment_maker(df: pd.DataFrame, type:str):
    df['sentiment'] = df[type].apply(analyzer.polarity_scores)
    df = pd.concat([df.drop(['sentiment'], axis=1), df['sentiment'].apply(pd.Series)], axis=1)
    df.rename(columns={col: f"{col}_{type}" for col in ["pos", "neu", "neg"]}, inplace=True)
    return df

def count_stop_words(tokens):
    stop_word_count = sum(1 for word in tokens if word in stop_words)
    return stop_word_count / len(tokens)

X_vec = X_train.copy()
X_vec['len'] = X_vec['tokens'].apply(lambda x: len(x))

X_vec = sentiment_maker(X_vec, "cleaned_text")
X_vec = sentiment_maker(X_vec, "clean_title")


X_vec['percent_stop'] = X_vec['tokens'].apply(count_stop_words)
X_vec

Unnamed: 0,id,title,author,text,cleaned_text,clean_title,tokens,len,neg_cleaned_text,neu_cleaned_text,pos_cleaned_text,compound,neg_clean_title,neu_clean_title,pos_clean_title,compound.1,percent_stop
14021,14021,A Song for Bill de Blasio Sounds a Sour Note f...,William Neuman,Mayor Bill de Blasio posted a video on his Cit...,mayor bill de blasio posted video city hall tw...,song bill de blasio sound sour note watchdog n...,"[mayor, bill, de, blasio, posted, video, city,...",386,0.047,0.806,0.146,0.9915,0.000,1.000,0.000,0.0000,0.023316
8663,8663,News Shot: Electronic Voting Machines Rigged f...,Lily Dane,Joe Joseph discusses a recent SHTFPlan.com art...,joe joseph discusses recent shtfplancom articl...,news shot electronic voting machine rigged cli...,"[joe, joseph, discusses, recent, shtfplancom, ...",57,0.082,0.793,0.125,0.4215,0.263,0.737,0.000,-0.3612,0.000000
19916,19916,US Plan “C” in Syria: Make “Al Qaeda Central” ...,Activist Post,"By Tony Cartalucci In reality, since even befo...",tony cartalucci reality since even syria confl...,u plan c syria make al qaeda central new capital,"[tony, cartalucci, reality, since, even, syria...",14,0.135,0.702,0.164,0.1280,0.000,1.000,0.000,0.0000,0.000000
18412,18412,Chart Of The Day: Miami Condo Market—–Choking ...,David Stockman,Chart Of The Day: Miami Condo Market Choking O...,chart day miami condo market choking inventory...,chart day miami condo marketchoking inventory,"[chart, day, miami, condo, market, choking, in...",56,0.050,0.863,0.088,0.0731,0.000,1.000,0.000,0.0000,0.000000
11694,11694,4 STEPS TO ENSURE YOUR AMMO STORES (VIRTUALLY)...,Iron Sheik,Home › GUNS › 4 STEPS TO ENSURE YOUR AMMO STOR...,home gun step ensure ammo store virtually fore...,step ensure ammo store virtually forever,"[home, gun, step, ensure, ammo, store, virtual...",218,0.078,0.730,0.192,0.9794,0.000,0.658,0.342,0.3818,0.027523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18214,18214,Islamists Celebrate Erdogan’s Victory in Refer...,Ali Waked,Islamists around the Arab world celebrated Tur...,islamist around arab world celebrated turkish ...,islamist celebrate erdogans victory referendum...,"[islamist, around, arab, world, celebrated, tu...",227,0.026,0.661,0.313,0.9975,0.000,0.619,0.381,0.5719,0.000000
7498,7498,People in cities other than Delhi spray Fog de...,Amrut Thobbi,People in cities other than Delhi spray Fog de...,people city delhi spray fog deo get medium att...,people city delhi spray fog deo get medium att...,"[people, city, delhi, spray, fog, deo, get, me...",186,0.098,0.737,0.164,0.9132,0.000,1.000,0.000,0.0000,0.000000
8394,8394,Active Navy SEAL Investigated for Staring in P...,Warner Todd Huston,"A decorated, active Navy SEAL is under investi...",decorated active navy seal investigation moonl...,active navy seal investigated staring porn mov...,"[decorated, active, navy, seal, investigation,...",212,0.083,0.793,0.124,0.9153,0.126,0.631,0.243,0.3182,0.000000
11895,11895,Why Did Facebook Say I Was Dead? - The New Yor...,Katie Rogers,Have you ever wondered what it would be like t...,ever wondered would like attend funeral friday...,facebook say dead new york time,"[ever, wondered, would, like, attend, funeral,...",190,0.270,0.554,0.176,-0.9803,0.462,0.538,0.000,-0.6486,0.005263


In [45]:
X_vec_numeric = X_vec.select_dtypes(include=np.number).drop(columns=["id"])
principal_components_df = pd.DataFrame(principal_components)
temp_index = X_vec.index
X_vec_numeric.reset_index(inplace=True)
principal_components_df.reset_index(inplace=True)
x_train_combined = pd.concat([X_vec_numeric, principal_components_df], axis=1)
x_train_combined.drop(columns=['index'], inplace=True)
x_train_combined.set_index(temp_index, inplace=True)
x_train_combined.columns = x_train_combined.columns.astype(str)
x_train_combined

Unnamed: 0,len,neg_cleaned_text,neu_cleaned_text,pos_cleaned_text,compound,neg_clean_title,neu_clean_title,pos_clean_title,compound.1,percent_stop,...,20,21,22,23,24,25,26,27,28,29
14021,386,0.047,0.806,0.146,0.9915,0.000,1.000,0.000,0.0000,0.023316,...,-0.002936,0.008136,-0.006456,-0.026059,-0.047053,-0.008303,-0.005470,0.060986,0.007418,-0.009984
8663,57,0.082,0.793,0.125,0.4215,0.263,0.737,0.000,-0.3612,0.000000,...,-0.014373,-0.020790,-0.055768,0.002642,-0.036467,0.014691,-0.042516,0.044465,-0.010219,0.015338
19916,14,0.135,0.702,0.164,0.1280,0.000,1.000,0.000,0.0000,0.000000,...,0.011753,-0.014055,-0.002278,-0.020237,-0.038115,0.048599,-0.026391,-0.033721,0.030635,-0.012194
18412,56,0.050,0.863,0.088,0.0731,0.000,1.000,0.000,0.0000,0.000000,...,0.005615,0.057239,-0.045525,0.037926,0.096838,0.192871,0.008809,0.140314,-0.005218,0.121113
11694,218,0.078,0.730,0.192,0.9794,0.000,0.658,0.342,0.3818,0.027523,...,0.001571,0.012521,-0.019878,0.004815,-0.004327,-0.002294,0.010640,-0.002441,0.003487,-0.023510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18214,227,0.026,0.661,0.313,0.9975,0.000,0.619,0.381,0.5719,0.000000,...,-0.000880,-0.015662,0.006521,0.006774,-0.032160,0.020931,-0.027814,-0.041325,-0.009325,-0.018736
7498,186,0.098,0.737,0.164,0.9132,0.000,1.000,0.000,0.0000,0.000000,...,0.019820,-0.009853,-0.029788,-0.001025,0.001128,0.002523,0.034027,0.006912,-0.036174,0.014851
8394,212,0.083,0.793,0.124,0.9153,0.126,0.631,0.243,0.3182,0.000000,...,0.008898,0.023698,-0.006160,-0.010577,-0.000437,0.000241,-0.010585,0.039981,0.031358,-0.018286
11895,190,0.270,0.554,0.176,-0.9803,0.462,0.538,0.000,-0.6486,0.005263,...,-0.058304,-0.028524,-0.031599,0.013817,-0.038477,0.020031,0.010092,-0.003640,0.003165,-0.009571


In [48]:
### Preprocessing X test
## Same preprocessing X train went through
## Utilizing fitted pca and tfidf
test_tfidf_matrix = vectorizer.transform(X_test['cleaned_text'])

test_tfidf = pd.DataFrame(test_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=X_test.index)
test_pca = pd.DataFrame(pca.transform(test_tfidf))


X_test_vec = X_test.copy()
X_test_vec['tokens'] = X_test_vec['cleaned_text'].apply(lambda x: x.split())
X_test_vec['len'] = X_test_vec['tokens'].apply(lambda x: len(x))
X_test_vec = sentiment_maker(X_test_vec, "cleaned_text")
X_test_vec = sentiment_maker(X_test_vec, "clean_title")
X_test_vec['percent_stop'] = X_test_vec['tokens'].apply(count_stop_words)

X_vec_numeric = X_test_vec.select_dtypes(include=np.number).drop(columns=["id"])
x_test_index = X_test.index
X_vec_numeric.reset_index(inplace=True)
test_pca.reset_index(inplace=True)
x_test_processed = pd.concat([X_vec_numeric, test_pca], axis=1)
x_test_processed.set_index(x_test_index, inplace=True)
x_test_processed.columns = x_test_processed.columns.astype(str)
x_test_processed.drop(columns=['index'], inplace=True)

In [49]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_combined)
x_test_scaled = scaler.transform(x_test_processed)
svm = SVC(kernel="linear", C=1.0, gamma="scale", probability=True, random_state=100)
svm.fit(x_train_scaled, y_train)

# y_pred = svm.predict(x_test_scaled)
y_probs = svm.predict_proba(x_test_scaled)[:,1]

In [57]:
percentages = np.arange(0, 1, 0.05)
accuracy = []
for cutoff in percentages:
    y_pred = [1 if proba > cutoff else 0 for proba in y_probs]
    accurate = sum(y_pred == y_test_true) / len(y_test_true)
    accuracy.append(accurate)
    
data = {
    'Cutoff': percentages,
    'Accuracy': accuracy
}

accutacy_table = pd.DataFrame(data)

# Create a line plot
fig = px.line(accutacy_table, x='Cutoff', y='Accuracy', title='Accuracy vs Cutoff',
              labels={'Cutoff': 'Threshold Cutoff', 'Accuracy': 'Accuracy'})
fig.show()

In [61]:
threshold = 0.8
y_pred_custom = (y_probs >= threshold).astype(int)

accuracy = accuracy_score(y_test_true, y_pred_custom)

print(f"Evaluation at Threshold: {threshold}")
print(f" Accuracy: {accuracy:.3f}")
print(classification_report(y_test_true, y_pred_custom))
print("y_test distribution:", np.bincount(y_test_true))
print("y_pred distribution:", np.bincount(y_pred_custom))

Evaluation at Threshold: 0.8
 Accuracy: 0.773
              precision    recall  f1-score   support

           0       0.78      0.77      0.77      3456
           1       0.76      0.78      0.77      3354

    accuracy                           0.77      6810
   macro avg       0.77      0.77      0.77      6810
weighted avg       0.77      0.77      0.77      6810

y_test distribution: [3456 3354]
y_pred distribution: [3381 3429]
