In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [3]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
df=pd.read_csv("/content/FA-KES-Dataset.csv", encoding='latin-1')
df.head()


Unnamed: 0,unit_id,article_title,article_content,source,date,location,labels
0,1914947530,Syria attack symptoms consistent with nerve ag...,Wed 05 Apr 2017 Syria attack symptoms consiste...,nna,4/5/2017,idlib,0
1,1914947532,Homs governor says U.S. attack caused deaths b...,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,nna,4/7/2017,homs,0
2,1914947533,Death toll from Aleppo bomb attack at least 112,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,nna,4/16/2017,aleppo,0
3,1914947534,Aleppo bomb blast kills six Syrian state TV,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,nna,4/19/2017,aleppo,0
4,1914947535,29 Syria Rebels Dead in Fighting for Key Alepp...,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,nna,7/10/2016,aleppo,0


In [6]:
df =df.drop(['date'],axis=1)

In [7]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
df.info()
df.shape
df.dtypes
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804 entries, 0 to 803
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   unit_id          804 non-null    int64 
 1   article_title    804 non-null    object
 2   article_content  804 non-null    object
 3   source           804 non-null    object
 4   location         804 non-null    object
 5   labels           804 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 37.8+ KB


Unnamed: 0,unit_id,labels
count,804.0,804.0
mean,1936024000.0,0.529851
std,18769680.0,0.499419
min,1914948000.0,0.0
25%,1923848000.0,0.0
50%,1924058000.0,1.0
75%,1962496000.0,1.0
max,1965511000.0,1.0


In [9]:
df.isnull().sum()

unit_id            0
article_title      0
article_content    0
source             0
location           0
labels             0
dtype: int64

In [10]:
df['labels'].value_counts()

1    426
0    378
Name: labels, dtype: int64

In [11]:
df['content'] =df['article_title']+''+df['article_content']

In [12]:
print(df['content'])

0      Syria attack symptoms consistent with nerve ag...
1      Homs governor says U.S. attack caused deaths b...
2      Death toll from Aleppo bomb attack at least 11...
3      Aleppo bomb blast kills six Syrian state TVWed...
4      29 Syria Rebels Dead in Fighting for Key Alepp...
                             ...                        
799    Turkish Bombardment Kills 20 Civilians in Syri...
800    Martyrs as Terrorists Shell Aleppos Salah Eddi...
801    Chemical Attack Kills Five Syrians in Aleppo S...
802    5 Killed as Russian Military Chopper Shot down...
803    Syrian Army Kills 48 ISIL Terrorists in Deir E...
Name: content, Length: 804, dtype: object


In [13]:
#X= df.drop(columns='labels',axis=1)
#y= df['labels']

In [14]:
port_stem = PorterStemmer()

In [15]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ', content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content





In [16]:
df['content'] = df['content'].apply(stemming)

In [17]:

print(df['content'])

0      syria attack symptom consist nerv agent use wh...
1      hom governor say u attack caus death doesnt se...
2      death toll aleppo bomb attack least sun apr de...
3      aleppo bomb blast kill six syrian state tvwed ...
4      syria rebel dead fight key aleppo roadsun jul ...
                             ...                        
799    turkish bombard kill civilian syria turkish bo...
800    martyr terrorist shell aleppo salah eddin mart...
801    chemic attack kill five syrian aleppo sana che...
802    kill russian militari chopper shot syria kill ...
803    syrian armi kill isil terrorist deir ezzorapri...
Name: content, Length: 804, dtype: object


In [18]:
X= df['content'].values
y= df['labels'].values

In [19]:
#convert text to numerical data

In [20]:
vectorizer =TfidfVectorizer()
vectorizer.fit(X)

X=vectorizer.transform(X)

In [21]:
print(X)

  (0, 7135)	0.026566933856363925
  (0, 7084)	0.03382921606540499
  (0, 7082)	0.039508398335727904
  (0, 7041)	0.07964306509932986
  (0, 7004)	0.029698869507181584
  (0, 6995)	0.12023172689300601
  (0, 6979)	0.04499235578036738
  (0, 6906)	0.03644535665491017
  (0, 6897)	0.05925046238159618
  (0, 6854)	0.18942461469264557
  (0, 6793)	0.05378738599271968
  (0, 6711)	0.09921417456350605
  (0, 6687)	0.031208539817178083
  (0, 6637)	0.09547420063526672
  (0, 6559)	0.02775415260760836
  (0, 6344)	0.0526090976574691
  (0, 6336)	0.099198032104018
  (0, 6333)	0.07964306509932986
  (0, 6332)	0.22114507820830376
  (0, 6303)	0.04278260244251628
  (0, 6237)	0.040021928821782626
  (0, 6210)	0.036192427839756375
  (0, 6153)	0.07964306509932986
  (0, 6131)	0.06479743445545294
  (0, 6128)	0.02115743174429768
  :	:
  (803, 1474)	0.07612480628280724
  (803, 1410)	0.12319004911433935
  (803, 1398)	0.03324609444933642
  (803, 1377)	0.09406786983709502
  (803, 1360)	0.0436722285334118
  (803, 1174)	0.031227

In [22]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.7,stratify=y, random_state=3)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the logistic regression model
model = LogisticRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the training data
prediction = model.predict(X_train)

# Calculate accuracy on the training data
accuracy = accuracy_score(prediction, y_train)

print('Accuracy:', accuracy)

Accuracy: 0.9377593360995851


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the logistic regression model
model = LogisticRegression()

# Fit the model to the training data
model.fit(X_test, y_test)

# Make predictions on the training data
prediction = model.predict(X_test)

# Calculate accuracy on the training data
accuracy = accuracy_score(prediction, y_test)

print('Accuracy:', accuracy)

Accuracy: 0.8898756660746003


In [27]:
input=X_test[0]

prediction=model.predict(input)
print(prediction)

[0]
