### **STEP 1: Importing data and necessary libraries**


In [5]:
import pandas as pd  # Importing the pandas library for data manipulation and analysis
from nltk.corpus import stopwords  # Importing the stopwords list from the NLTK library
from nltk.stem.porter import PorterStemmer  # Importing the PorterStemmer for stemming words
import re  # Importing the re module for regular expression operations
from sklearn.feature_extraction.text import TfidfVectorizer  # Importing TfidfVectorizer for converting text data to TF-IDF features
from sklearn.model_selection import train_test_split  # Importing train_test_split for splitting data into training and testing sets
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Load the training dataset into a DataFrame
# NOTE(review): "/content/..." is presumably the Colab upload location — adjust the path when running elsewhere
df=pd.read_csv("/content/train.csv")

In [7]:
# Preview the first 5 rows to check that the columns loaded as expected
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


### **STEP 2: EDA**

In [8]:
# Summarise the DataFrame: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [9]:
# Dimensions of the DataFrame as (rows, columns)
df.shape

(20800, 5)

There are 20800 rows and 5 columns in the dataset.

In [10]:
# Count the missing values per column
df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [11]:
# Replace every missing value in the DataFrame with an empty string,
# so the text clean-up further down always receives a str
df = df.fillna('')

In [12]:
# Re-check after filling: count the missing values per column again
df.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [13]:
# List the column labels of the DataFrame
df.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [14]:
# Drop the unnecessary columns; only 'text' and 'label' are used from here on.
# errors='ignore' lets this cell be re-run after the columns are already gone
# (without it a second run raises KeyError).
df=df.drop(columns=['id','title','author'],errors='ignore')

In [15]:
# Preview the first 5 rows after dropping the unused columns
df.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


### **STEP 3: DATA PRE-PROCESSING**

In [16]:
# Create a single PorterStemmer instance; the stemming function defined below reuses it
port_stem=PorterStemmer()

In [17]:
# Display the stemmer object to confirm it was created
port_stem

<PorterStemmer>

In [18]:
# Cache for the English stopword list, filled on the first call to stemming().
# Re-running this cell resets it.
_ENGLISH_STOPWORDS = None


def stemming(content):
    """Normalise a piece of text for vectorisation.

    Steps: replace every non-alphabetic character with a space, lowercase,
    split on whitespace, drop English stopwords, Porter-stem the remaining
    words and join them back with single spaces.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The space-joined stemmed tokens, e.g. "Hi this is Harshitha" becomes
        'hi harshitha'.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Load the stopword list once instead of once per token, and keep it
        # in a frozenset so each membership test is a hash lookup rather than
        # a scan of the whole list.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    words = letters_only.lower().split()
    stemmed = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS]
    return ' '.join(stemmed)


In [19]:
# Quick sanity check of the stemming function on a short sentence
stemming("Hi this is Harshitha")

'hi harshitha'

In [20]:
# Clean and stem every entry of the 'text' column.
# This overwrites the column, so re-running the cell processes the already-stemmed text again.
df['text']=df['text'].apply(stemming)

### **STEP 4: Train and Test Data Splitting**

In [21]:
# Features (X) are the processed 'text' column; the target (y) is the 'label' column
X, y = df['text'], df['label']

In [27]:
# Split the data into training and testing sets: 20% of the rows are held out,
# and the fixed random_state keeps the split reproducible.
# X and y are the feature/label Series defined in the previous cell, and
# train_test_split is already imported in the first cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Create a TF-IDF vectorizer with its default settings
vect=TfidfVectorizer()

In [29]:
# Fit the vectorizer on the training text only, then apply the same fitted
# vectorizer to the test text so no information from the test set is used in fitting.
# Both names are rebound from raw text to vectorized features, so re-run the
# split cell before re-running this one.
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

### **STEP 5: Model Building**

In [30]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so the fitted tree (and therefore the reported accuracy) is the
# same on every Restart & Run All; without it the result can vary between runs.
model=DecisionTreeClassifier(random_state=42)

In [31]:
 # Train the model using the training data (X_train) and training labels (y_train)
model.fit(X_train,y_train)

In [32]:
# Predict labels for the test set.
# NOTE(review): `prediction` is not used by any of the later cells shown here.
prediction=model.predict(X_test)

In [35]:
# Evaluate the model on the held-out test data and labels (accuracy on the test set)
model_score = model.score(X_test, y_test)

# Print the accuracy score
print(model_score)

0.8795673076923077


In [36]:
import pickle

# Persist the fitted vectorizer and the trained model. The `with` blocks make
# sure each file is flushed and closed before it is read back.
with open('vector.pkl', 'wb') as vector_file:
    pickle.dump(vect, vector_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Reload both artefacts from disk. These files were written just above;
# never unpickle files from an untrusted source.
with open('vector.pkl', 'rb') as vector_file:
    vector_form = pickle.load(vector_file)
with open('model.pkl', 'rb') as model_file:
    load_model = pickle.load(model_file)

In [52]:
def fake_news(news):
    """Classify one raw news text with the reloaded vectorizer and model.

    The text is cleaned with `stemming`, wrapped in a one-item list,
    transformed by `vector_form` and passed to `load_model.predict`.

    Parameters
    ----------
    news : str
        Raw article text.

    Returns
    -------
    The result of `load_model.predict` for that single input.
    """
    cleaned = stemming(news)
    features = vector_form.transform([cleaned])
    return load_model.predict(features)

In [53]:
# Run the classifier on a hand-written sample article.
# NOTE(review): the sample string also contains a stray closing quote and a trailing
# "Feel free to use this piece of fake news..." paragraph that is not part of the article;
# both are passed to the model as-is.
val=fake_news("""Scientists have discovered a new species of unicorns living in the depths of the Amazon rainforest. These unicorns are said to have the ability to fly and shoot rainbows from their horns. Researchers claim that the discovery of these magical creatures will revolutionize our understanding of biology and the natural world."

Feel free to use this piece of fake news to test your fake news detection algorithm or any other related task. Let me know if you need further assistance! """)

In [54]:
# `val` is what fake_news returned for a single input text. Index its only
# entry explicitly instead of relying on the truth value of an element-wise
# comparison, which raises ValueError as soon as there is more than one element.
# A predicted label of 0 is reported as reliable, anything else as unreliable.
if val[0] == 0:
    print('reliable')
else:
    print('unreliable')

reliable
