# Model Building

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed dataset; the preview below shows title/text/subject/date,
# word2vec_* feature columns and a `label` column.
# NOTE(review): the preview also shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which
# look like index columns written out by earlier to_csv calls — confirm whether
# they are needed downstream or should be dropped.
Data = pd.read_csv("Preprocessed_Data.csv")
Data.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,word2vec_1,word2vec_2,word2vec_3,word2vec_4,word2vec_5,...,word2vec_42,word2vec_43,word2vec_44,word2vec_45,word2vec_46,word2vec_47,word2vec_48,word2vec_49,word2vec_50,label
0,0,as us budget fight looms republicans flip thei...,washington reuters head conservative republica...,politicsNews,"December 31, 2017",54.570467,-13.980083,134.274746,-99.075305,33.005759,...,66.357857,55.300547,35.731497,-99.773088,38.575045,-183.991856,150.984416,13.029735,129.900106,Real
1,1,us military to accept transgender recruits on ...,washington reuters transgender people allowed ...,politicsNews,"December 29, 2017",16.392168,-26.120816,4.848252,-15.762792,72.401562,...,25.9766,104.826356,68.929889,-38.854926,-18.866133,-140.938915,123.375925,-17.817478,82.014783,Real
2,2,senior us republican senator let mr mueller do...,washington reuters special counsel investigati...,politicsNews,"December 31, 2017",52.821734,6.984183,41.01598,25.421412,37.767004,...,60.474485,18.700818,5.784064,-4.107978,3.144462,-108.061981,96.591288,2.033224,32.534008,Real
3,3,fbi russia probe helped by australian diplomat...,washington reuters trump campaign adviser geor...,politicsNews,"December 30, 2017",27.055829,31.024265,26.238645,43.132185,42.128654,...,58.660402,27.022198,-14.10261,13.175057,-22.10759,-115.136813,88.820179,18.092601,11.88849,Real
4,4,trump wants postal service to charge much more...,seattlewashington reuters president donald tru...,politicsNews,"December 29, 2017",102.678824,-19.267467,192.317863,49.99806,53.0654,...,21.815033,120.072916,52.685541,-114.790971,9.105391,-163.03724,178.051051,54.234891,82.207563,Real


In [3]:
%%time

from afinn import Afinn

# Initialize the AFINN lexicon
afinn = Afinn()

# Function to calculate sentiment scores for each news article
def calculate_sentiment(text):
    """Return the AFINN sentiment score of ``text``.

    Delegates to the notebook-level ``afinn`` lexicon instance and hands back
    whatever ``afinn.score`` produces for the given string.
    """
    return afinn.score(text)

# Apply the sentiment analysis function to the 'text' column
Data['sentiment_score_AFINN'] = Data['text'].apply(str).apply(calculate_sentiment)

# Optional: Create sentiment labels based on the scores
def categorize_sentiment(score):
    """Map a numeric sentiment score to a binary label.

    Scores strictly below zero become 'Negative'; every other score,
    including exactly zero, becomes 'Positive'.
    """
    return 'Negative' if score < 0 else 'Positive'
    
Data['sentiment_label_AFINN'] = Data['sentiment_score_AFINN'].apply(categorize_sentiment)

# Display the article text alongside its AFINN sentiment score
Data[['text', 'sentiment_score_AFINN']].head()

CPU times: total: 3min 34s
Wall time: 3min 35s


Unnamed: 0,text,sentiment_score_AFINN
0,washington reuters head conservative republica...,8.0
1,washington reuters transgender people allowed ...,10.0
2,washington reuters special counsel investigati...,-8.0
3,washington reuters trump campaign adviser geor...,-14.0
4,seattlewashington reuters president donald tru...,-14.0


In [4]:
min(Data['sentiment_score_AFINN']) , max(Data['sentiment_score_AFINN'])

(-689.0, 275.0)

In [5]:
%%time

import nltk
from nltk.corpus import sentiwordnet as swn

# The 'text' column of the DataFrame 'Data' holds the article text scored below;
# each article receives one SentiWordNet-based score.

# Fetch every NLTK resource this cell relies on, not just the SentiWordNet lexicon:
#   - 'sentiwordnet': the sentiment lexicon itself
#   - 'wordnet': swn.senti_synsets() looks its synsets up in the WordNet corpus
#   - 'punkt' / 'punkt_tab': tokenizer models used by nltk.word_tokenize()
#     (older NLTK releases load 'punkt', newer ones load 'punkt_tab')
# Without them a fresh environment raises LookupError when the scoring runs.
for resource in ('sentiwordnet', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Function to calculate sentiment scores for each news article
from functools import lru_cache


@lru_cache(maxsize=None)
def _first_sense_scores(word):
    """Return ``(pos_score, neg_score)`` of the first SentiWordNet synset of ``word``.

    Returns ``None`` when SentiWordNet lists no synset for the word. Results
    are memoized so that a token repeated within or across articles triggers
    only one corpus lookup.
    """
    synsets = list(swn.senti_synsets(word))
    if not synsets:
        return None
    # Only the first listed synset is used, as in the original scoring rule.
    first = synsets[0]
    return first.pos_score(), first.neg_score()


def calculate_sentiment(text):
    """Return a SentiWordNet-based sentiment score for ``text``.

    The text is tokenized with ``nltk.word_tokenize``; for every token that has
    at least one SentiWordNet synset, the positive and negative scores of its
    first synset are accumulated.

    Parameters
    ----------
    text : str
        The article text to score.

    Returns
    -------
    int or float
        Total positive score minus total negative score. Tokens without a
        synset contribute nothing; if no token has one, the result is ``0``.
    """
    positive_score = 0
    negative_score = 0

    for word in nltk.word_tokenize(text):
        scores = _first_sense_scores(word)
        # Skip tokens unknown to SentiWordNet instead of adding zeros, so the
        # accumulated sums are exactly those of a plain per-token lookup.
        if scores is not None:
            positive_score += scores[0]
            negative_score += scores[1]

    return positive_score - negative_score

# Apply the sentiment analysis function to the 'text' column
Data['sentiment_score_sentiWordNet'] = Data['text'].apply(str).apply(calculate_sentiment)

# # Optional: Create sentiment labels based on the scores
# def categorize_sentiment(score):
#     if score > 0:
#         return 'Positive'
#     elif score < 0:
#         return 'Negative'
#     else:
#         return 'Neutral'

Data['sentiment_label_sentiWordNet'] = Data['sentiment_score_sentiWordNet'].apply(categorize_sentiment)

# Display the article text alongside its SentiWordNet sentiment score
Data[['text', 'sentiment_score_sentiWordNet']].head()

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\jaypr\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


CPU times: total: 6min 10s
Wall time: 6min 13s


Unnamed: 0,text,sentiment_score_sentiWordNet
0,washington reuters head conservative republica...,4.875
1,washington reuters transgender people allowed ...,5.75
2,washington reuters special counsel investigati...,-3.0
3,washington reuters trump campaign adviser geor...,4.0
4,seattlewashington reuters president donald tru...,2.125


In [6]:
min(Data['sentiment_score_sentiWordNet']) , max(Data['sentiment_score_sentiWordNet'])

(-29.875, 57.55599999999998)

In [7]:
%%time

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The 'text' column of the DataFrame 'Data' holds the article text scored below;
# each article receives one VADER compound score.

# Initialize NLTK's VADER sentiment analyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Function to calculate sentiment scores for each news article
def calculate_sentiment(text):
    """Return VADER's compound polarity score for ``text``.

    Runs the notebook-level ``sia`` analyzer and uses the 'compound' entry of
    its result as the overall sentiment score.
    """
    return sia.polarity_scores(text)['compound']

# Apply the sentiment analysis function to the 'text' column
Data['sentiment_score_VADER'] = Data['text'].apply(str).apply(calculate_sentiment)

Data['sentiment_label_VADER'] = Data['sentiment_score_VADER'].apply(categorize_sentiment)

# Display the article text alongside its VADER sentiment score
Data[['text', 'sentiment_score_VADER']].head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jaypr\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


CPU times: total: 1min 21s
Wall time: 1min 21s


Unnamed: 0,text,sentiment_score_VADER
0,washington reuters head conservative republica...,0.9831
1,washington reuters transgender people allowed ...,0.9714
2,washington reuters special counsel investigati...,0.5719
3,washington reuters trump campaign adviser geor...,-0.1761
4,seattlewashington reuters president donald tru...,0.967


In [8]:
min(Data['sentiment_score_VADER']) , max(Data['sentiment_score_VADER'])

(-1.0, 0.9999)

In [9]:
Data[['text', 'sentiment_score_AFINN', 'sentiment_score_sentiWordNet', 'sentiment_score_VADER']].head()

Unnamed: 0,text,sentiment_score_AFINN,sentiment_score_sentiWordNet,sentiment_score_VADER
0,washington reuters head conservative republica...,8.0,4.875,0.9831
1,washington reuters transgender people allowed ...,10.0,5.75,0.9714
2,washington reuters special counsel investigati...,-8.0,-3.0,0.5719
3,washington reuters trump campaign adviser geor...,-14.0,4.0,-0.1761
4,seattlewashington reuters president donald tru...,-14.0,2.125,0.967


In [10]:
Data[['text', 'sentiment_score_AFINN', 'sentiment_score_sentiWordNet', 'sentiment_score_VADER']].tail()

Unnamed: 0,text,sentiment_score_AFINN,sentiment_score_sentiWordNet,sentiment_score_VADER
44266,21st century wire says 21wire reported earlier...,-7.0,1.375,0.5571
44267,21st century wire says familiar theme whenever...,3.0,-2.375,-0.25
44268,patrick henningsen 21st century wireremember o...,-38.0,13.291,-0.9969
44269,21st century wire says al jazeera america go h...,-5.0,3.125,0.8555
44270,21st century wire says 21wire predicted new ye...,-53.0,-0.375,-0.9964


In [11]:
Data[['text', 'sentiment_label_AFINN', 'sentiment_label_sentiWordNet', 'sentiment_label_VADER']].head()

Unnamed: 0,text,sentiment_label_AFINN,sentiment_label_sentiWordNet,sentiment_label_VADER
0,washington reuters head conservative republica...,Positive,Positive,Positive
1,washington reuters transgender people allowed ...,Positive,Positive,Positive
2,washington reuters special counsel investigati...,Negative,Negative,Positive
3,washington reuters trump campaign adviser geor...,Negative,Positive,Negative
4,seattlewashington reuters president donald tru...,Negative,Positive,Positive


In [12]:
Data[['text', 'sentiment_label_AFINN', 'sentiment_label_sentiWordNet', 'sentiment_label_VADER']].tail()

Unnamed: 0,text,sentiment_label_AFINN,sentiment_label_sentiWordNet,sentiment_label_VADER
44266,21st century wire says 21wire reported earlier...,Negative,Positive,Positive
44267,21st century wire says familiar theme whenever...,Positive,Negative,Negative
44268,patrick henningsen 21st century wireremember o...,Negative,Positive,Negative
44269,21st century wire says al jazeera america go h...,Negative,Positive,Positive
44270,21st century wire says 21wire predicted new ye...,Negative,Negative,Negative


In [14]:
from scipy import stats

# Define a function to calculate the most frequent label
def calculate_most_frequent_label(row):
    """Return the majority label among the three per-lexicon sentiment labels.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'sentiment_label_AFINN',
        'sentiment_label_sentiWordNet' and 'sentiment_label_VADER'
        (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The label that occurs most often among the three. If several labels are
    tied, the one appearing first in AFINN, SentiWordNet, VADER order wins.
    """
    labels = [
        row['sentiment_label_AFINN'],
        row['sentiment_label_sentiWordNet'],
        row['sentiment_label_VADER']
    ]
    # Iterate the list itself rather than set(labels): max() keeps the first
    # maximal item it meets, so ties are resolved by list order instead of by
    # set iteration order, which is not stable across interpreter runs.
    most_frequent_label = max(labels, key=labels.count)
    return most_frequent_label

# Apply the calculate_most_frequent_label function to each row to create the 'sentiment_label_overall' column
Data['sentiment_label_overall'] = Data.apply(calculate_most_frequent_label, axis=1)

In [15]:
# Display the DataFrame with sentiment labels
Data[['text', 'sentiment_label_AFINN', 'sentiment_label_sentiWordNet', 'sentiment_label_VADER', 'sentiment_label_overall']].head()

Unnamed: 0,text,sentiment_label_AFINN,sentiment_label_sentiWordNet,sentiment_label_VADER,sentiment_label_overall
0,washington reuters head conservative republica...,Positive,Positive,Positive,Positive
1,washington reuters transgender people allowed ...,Positive,Positive,Positive,Positive
2,washington reuters special counsel investigati...,Negative,Negative,Positive,Negative
3,washington reuters trump campaign adviser geor...,Negative,Positive,Negative,Negative
4,seattlewashington reuters president donald tru...,Negative,Positive,Positive,Positive


In [16]:
# Display the DataFrame with sentiment labels
Data[['text', 'sentiment_label_AFINN', 'sentiment_label_sentiWordNet', 'sentiment_label_VADER', 'sentiment_label_overall']].tail()

Unnamed: 0,text,sentiment_label_AFINN,sentiment_label_sentiWordNet,sentiment_label_VADER,sentiment_label_overall
44266,21st century wire says 21wire reported earlier...,Negative,Positive,Positive,Positive
44267,21st century wire says familiar theme whenever...,Positive,Negative,Negative,Negative
44268,patrick henningsen 21st century wireremember o...,Negative,Positive,Negative,Negative
44269,21st century wire says al jazeera america go h...,Negative,Positive,Positive,Positive
44270,21st century wire says 21wire predicted new ye...,Negative,Negative,Negative,Negative


In [17]:
Real_Data = pd.read_csv("Datasets/ISOT Fake News Dataset/True.csv")
Real_Data.shape

(21417, 4)

In [18]:
Real_Data['text'][0]      # Positive  

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [19]:
Real_Data['text'][1]       # Negative for LGBTQ

'WASHINGTON (Reuters) - Transgender people will be allowed for the first time to enlist in the U.S. military starting on Monday as ordered by federal courts, the Pentagon said on Friday, after President Donald Trump’s administration decided not to appeal rulings that blocked his transgender ban. Two federal appeals courts, one in Washington and one in Virginia, last week rejected the administration’s request to put on hold orders by lower court judges requiring the military to begin accepting transgender recruits on Jan. 1. A Justice Department official said the administration will not challenge those rulings. “The Department of Defense has announced that it will be releasing an independent study of these issues in the coming weeks. So rather than litigate this interim appeal before that occurs, the administration has decided to wait for DOD’s study and will continue to defend the president’s lawful authority in District Court in the meantime,” the official said, speaking on condition 

In [20]:
Real_Data['text'][2]  # Negative for those who support Trump, and vice versa   # negative

'WASHINGTON (Reuters) - The special counsel investigation of links between Russia and President Trump’s 2016 election campaign should continue without interference in 2018, despite calls from some Trump administration allies and Republican lawmakers to shut it down, a prominent Republican senator said on Sunday. Lindsey Graham, who serves on the Senate armed forces and judiciary committees, said Department of Justice Special Counsel Robert Mueller needs to carry on with his Russia investigation without political interference. “This investigation will go forward. It will be an investigation conducted without political influence,” Graham said on CBS’s Face the Nation news program. “And we all need to let Mr. Mueller do his job. I think he’s the right guy at the right time.”  The question of how Russia may have interfered in the election, and how Trump’s campaign may have had links with or co-ordinated any such effort, has loomed over the White House since Trump took office in January. It

In [21]:
Real_Data['text'][3]          # Negative

'WASHINGTON (Reuters) - Trump campaign adviser George Papadopoulos told an Australian diplomat in May 2016 that Russia had political dirt on Democratic presidential candidate Hillary Clinton, the New York Times reported on Saturday. The conversation between Papadopoulos and the diplomat, Alexander Downer, in London was a driving factor behind the FBI’s decision to open a counter-intelligence investigation of Moscow’s contacts with the Trump campaign, the Times reported. Two months after the meeting, Australian officials passed the information that came from Papadopoulos to their American counterparts when leaked Democratic emails began appearing online, according to the newspaper, which cited four current and former U.S. and foreign officials. Besides the information from the Australians, the probe by the Federal Bureau of Investigation was also propelled by intelligence from other friendly governments, including the British and Dutch, the Times said. Papadopoulos, a Chicago-based inte

In [22]:
Real_Data['text'][4]     # Neutral



In [23]:
Fake_Data = pd.read_csv("Datasets/ISOT Fake News Dataset/Fake.csv")
Fake_Data.shape

(23481, 4)

In [24]:
Fake_Data['text'][23476]      # positive

'21st Century Wire says As 21WIRE reported earlier this week, the unlikely  mishap  of two US Naval vessels straying into Iranian waters   just hours before the President s State of the Union speech, followed by the usual parade of arch-neocons coming on TV in real time to declare the incident as  an act of aggression  by Iran against the United States   is no mere coincidence.24 hours after the incident, the Iranians returned all 11 US sailors, unharmed and in good spirits. The only remaining casualty from this event was an incident of a common condition in Washington known as  Pre-Traumatic Stress Disorder    suffered by a certain US Senator was mortified by the uneventful outcome which followed Daniel McAdams Ron Paul Institute  The two US Navy riverine command boats intercepted in Iranian territorial waters yesterday were sent on their way along with the crew of 10 US sailors after brief detention on Iranian soil.According to news reports, the well-armed warships either suffered me

In [25]:
Fake_Data['text'][23477]      # Negative

'21st Century Wire says It s a familiar theme. Whenever there is a dispute or a change of law, and two tribes go to war, there is normally only one real winner after the tribulation  the lawyers. Ars TechnicaIn late 2013, Yahoo was hit with six lawsuits over its practice of using automated scans of e-mail to produce targeted ads. The cases, which were consolidated in federal court, all argued that the privacy rights of non-Yahoo users, who  did not consent to Yahoo s interception and scanning of their emails,  were being violated by a multi-billion dollar company.Now, lawyers representing the plaintiffs are singing a different tune. Last week, they asked US District Judge Lucy Koh to accept a proposed settlement (PDF). Under the proposal, the massive class of non-Yahoo users won t get any payment, but the class lawyers at Girard Gibbs and Kaplan Fox intend to ask for up to $4 million in fees. (The ultimate amount of fees will be up to the judge, but Yahoo has agreed not to oppose any f

In [26]:
Fake_Data['text'][23478]    # Negative

'Patrick Henningsen  21st Century WireRemember when the Obama Administration told the world how it hoped to identify 5,000 reliable non-jihadist  moderate  rebels hanging out in Turkey and Jordan, who might want to fight for Washington in Syria? After all the drama over its infamous  train and equip  program to create their own Arab army in Syria, they want to give it another try.This week, Pentagon officials announced their new plan to train up to 7,000 more  moderate  fighters, but this time the project would take place inside Syria (and to hell with international law).We re told that this was requested by Ankara, and with all NATO allies singing the same hymn   claiming that this new effort will help in securing Turkey s porous border with Syria, or so the story goes. Washington s political cover for this is fashioned from the popular post-Paris theme: to protect civilized Europe from invading hordes and the terrorists who hide among them, as stated in the Wall Street Journal: The p

In [27]:
Fake_Data['text'][23479]       # Negative   # Positive for that news channel: any publicity is good publicity

'21st Century Wire says Al Jazeera America will go down in history as one of the biggest failures in broadcast media history.Ever since the US and its allies began plotting to overthrow Libya and Syria, Al Jazeera has deteriorated from a promising international news network in 2003   into what it has become in 2016   a full-blown agit prop media shop for the US State Department and the Pentagon. In fact, US Military Central Command (CENTCOM) for the Middle East region is located in Qatar s capital of Doha, conveniently located just down the road from Al Jazeera s headquarters. Nice and cozy.Al Jazeera s role in promoting  regime change  and the destablization of Syria is now recorded history, as are its many shameful exhibitions of media fakery.The Qatari royals tried to expand propaganda operations into North America, but no matter how much money they threw at it (and those royals just love throwing money at things to try and get them to work), no one cared about it, much less watched

In [28]:
Fake_Data['text'][23480]    # Neutral # Negative from the perspective of Americans

'21st Century Wire says As 21WIRE predicted in its new year s look ahead, we have a new  hostage  crisis underway.Today, Iranian military forces report that two small riverine U.S. Navy boats were seized in Iranian waters, and are currently being held on Iran s Farsi Island in the Persian Gulf. A total of 10 U.S. Navy personnel, nine men and one woman, have been detained by Iranian authorities. NAVY STRAYED: U.S. Navy patrol boat in the Persian Gulf (Image Source: USNI)According to the Pentagon, the initial narrative is as follows: The sailors were on a training mission around noon ET when their boat experienced mechanical difficulty and drifted into Iranian-claimed waters and were detained by the Iranian Coast Guard, officials added. The story has since been slightly revised by White House spokesman Josh Earnest to follow this narrative:The 2 boats were traveling en route from Kuwait to Bahrain, when they were stopped and detained by the Iranians.According to USNI, search and rescue t