In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the news dataset from 'news.csv' (path is relative to the working
# directory) and preview it.
df = pd.read_csv('news.csv')
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
4904,2034,Is it 2016 already?,"Jan. 1, 2016, is 859 days away. But, judging f...",REAL
4905,7671,It’s On: Between Duterte and America,shorty BY PETER LEE I ’ve written a couple pie...,FAKE
4906,2602,Boehner: Israel trip planned before Netanyahu-...,The Ohio Republican will travel to Israel this...,REAL
4907,6766,The US May Soon Face an Apocalyptic Seismic Event,"Today, an ever increasing number of earthquake...",FAKE


In [3]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Per-column count of missing values before cleaning.
df.isnull().sum(axis=0)

title    0
text     1
label    1
dtype: int64

In [5]:
# Discard every row that has at least one missing value (mutates df in place).
df.dropna(axis=0, how='any', inplace=True)

In [6]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
df.isnull().sum(axis=0)

title    0
text     0
label    0
dtype: int64

In [7]:
# Encode the target as an integer flag: 1 where label == 'FAKE', 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the resulting dtype identical on every platform.
df['Fake'] = df['label'].eq('FAKE').astype('int64')

In [8]:
# Preview the frame now that the binary Fake column has been added.
df

Unnamed: 0,title,text,label,Fake
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,1
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,1
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,0
...,...,...,...,...
4903,Israeli Deputy Minister: Italy Quakes Retribut...,Edmondo Burr in Conspiracies // 0 Comments T...,FAKE,1
4904,Is it 2016 already?,"Jan. 1, 2016, is 859 days away. But, judging f...",REAL,0
4905,It’s On: Between Duterte and America,shorty BY PETER LEE I ’ve written a couple pie...,FAKE,1
4906,Boehner: Israel trip planned before Netanyahu-...,The Ohio Republican will travel to Israel this...,REAL,0


In [9]:
# The string label is redundant once the numeric Fake column exists, so drop it.
# Rebinding with errors='ignore' makes the cell re-runnable: executing it again
# after 'label' is gone no longer raises KeyError.
df = df.drop(columns=['label'], errors='ignore')
df

Unnamed: 0,title,text,Fake
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0
...,...,...,...
4903,Israeli Deputy Minister: Italy Quakes Retribut...,Edmondo Burr in Conspiracies // 0 Comments T...,1
4904,Is it 2016 already?,"Jan. 1, 2016, is 859 days away. But, judging f...",0
4905,It’s On: Between Duterte and America,shorty BY PETER LEE I ’ve written a couple pie...,1
4906,Boehner: Israel trip planned before Netanyahu-...,The Ohio Republican will travel to Israel this...,0


In [10]:
# Separate the target (Fake flag) from the remaining feature columns.
Y = df['Fake']
X = df.drop(columns=['Fake'])

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with the vocabulary capped at 1000 features.
tfid = TfidfVectorizer(max_features=1000)

In [12]:
# Learn the TF-IDF vocabulary from the `text` column and vectorize every row.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split performed later, so held-out documents influence the learned weights —
# confirm whether that is acceptable for the reported scores.
x_tfid = tfid.fit_transform(X.loc[:, 'text'])

# Dense, column-labelled view of the sparse TF-IDF matrix.
dfx = pd.DataFrame(data=x_tfid.toarray(), columns=tfid.get_feature_names_out())
dfx.shape


(4908, 1000)

In [28]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_tfid,
    Y,
    test_size=0.2,
    random_state=0,
)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier

In [30]:
# Candidate estimators and the (deliberately small) hyper-parameter grids to
# search. Each key is a display name; each value holds the estimator under
# 'model' and its grid under 'params'.
model_params = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.1, 1],  # Reduced range for regularization
            'solver': ['liblinear'],  # Limiting to a faster solver
        }
    },
    'Decision Tree': {
        # Seeded so repeated runs of the search yield the same tree and score.
        'model': DecisionTreeClassifier(random_state=0),
        'params': {
            # Only the depth limit is tuned; the split criterion is left at its default.
            'max_depth': [5, 10],
        }
    },
    'Random Forest': {
        # Seeded so repeated runs of the search yield the same forest and score.
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': [10, 50],  # Fewer estimators
            'criterion': ['gini'],
        }
    },
    'Support Vector Classifier (SVC)': {
        'model': SVC(),
        'params': {
            'C': [1, 10],  # Focus on most common C values
            'kernel': ['linear'],  # Limit to linear kernel for speed
        }
    },
    'K-Nearest Neighbors (KNN)': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5],  # Reduced neighbor options
        }
    }
}


In [31]:
from sklearn.model_selection import GridSearchCV
# Run a 3-fold grid search for every candidate model and record, per model,
# the best cross-validated score and the parameters that achieved it.
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=3, return_train_score=False)
    clf.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best score': clf.best_score_,
        'best_params': clf.best_params_,
    })

# Build the summary table once, after the loop: rebuilding it on every
# iteration was wasted work, and the bare `df_score` expression inside the
# loop displayed nothing. Building it here also defines df_score when
# model_params is empty.
df_score = pd.DataFrame(scores, columns=['model', 'best score', 'best_params'])

In [32]:
# Show the best cross-validated score and parameters found for each model.
df_score

Unnamed: 0,model,best score,best_params
0,Logistic Regression,0.89404,"{'C': 1, 'solver': 'liblinear'}"
1,Decision Tree,0.784262,{'max_depth': 5}
2,Random Forest,0.88283,"{'criterion': 'gini', 'n_estimators': 50}"
3,Support Vector Classifier (SVC),0.903974,"{'C': 1, 'kernel': 'linear'}"
4,K-Nearest Neighbors (KNN),0.794956,{'n_neighbors': 3}


In [39]:
from sklearn.ensemble import StackingClassifier
# Stack three base learners and let a logistic-regression meta-model combine
# their outputs.
base_models = [
    # Seeded so the forest, and therefore the reported test score, is reproducible.
    ('rf', RandomForestClassifier(n_estimators=50, random_state=0)),
    ('kn', KNeighborsClassifier(n_neighbors=3)),
    ('svm', SVC(kernel='linear')),
]
meta_model = LogisticRegression(solver='liblinear')
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacked_model.fit(x_train, y_train)
# Score the fitted stack on the held-out split.
stacked_model.score(x_test, y_test)

0.9215885947046843

In [49]:
# Ten headline strings used as an ad-hoc prediction sample.
# NOTE(review): these are titles (two of them are cut off with "..."), whereas
# `tfid` was fitted on the `text` column — predictions on such short strings
# may be unreliable; confirm this is the intended input.
news_samples = [
    "You Can Smell Hillary’s Fear",
    "Watch The Exact Moment Paul Ryan Committed Pol...",
    "Kerry to go to Paris in gesture of sympathy",
    "Bernie supporters on Twitter erupt in anger ag...",
    "The Battle of New York: Why This Primary Matters",
    "Israeli Deputy Minister: Italy Quakes Retribution for UNESCO Vote",
    "Is it 2016 already?",
    "It’s On: Between Duterte and America",
    "Boehner: Israel trip planned before Netanyahu speech invite",
    "The US May Soon Face an Apocalyptic Seismic Event"
]


In [50]:
# Vectorize the sample strings with the already-fitted TF-IDF vectorizer.
news_smaple_tfid = tfid.transform(news_samples)

# Predict the Fake flag (1 = FAKE, 0 = otherwise) for each sample with the
# stacked model; no DataFrame conversion happens here.
stacked_model.predict(news_smaple_tfid)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [45]:
# Fit a stand-alone linear-kernel SVC on the training split and run it on the
# same vectorized samples for comparison.
svm = SVC(kernel='linear').fit(x_train, y_train)
svm.predict(news_smaple_tfid)

array([1, 1, 1, 1, 1], dtype=int64)

In [46]:
# Predictions of the stacked model on the held-out test split.
y_predict = stacked_model.predict(x_test)

In [48]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the stacked model on the test split
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)
cm

array([[470,  42],
       [ 35, 435]], dtype=int64)