In [1]:
import pandas as pd

# Load the raw dataset. Later cells read its `length`, `text` and `label` columns.
# NOTE(review): the recorded run failed with FileNotFoundError, so the CSV must be present
# in the notebook's working directory (or this path updated) before the notebook can run.
df1 = pd.read_csv('capstone_data_new.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'capstone_data_new.csv'

In [2]:
# Preview the first rows of the raw dataset.
df1.head()

NameError: name 'df1' is not defined

In [None]:
# (rows, columns) of the raw dataset.
df1.shape

### Segregating into short and long texts

In [None]:
# Label distribution among texts whose `length` is below 280.
df1.loc[df1['length'] < 280, 'label'].value_counts()

In [3]:
# Label distribution among texts whose `length` is above 280.
df1.loc[df1['length'] > 280, 'label'].value_counts()

NameError: name 'df1' is not defined

In [None]:
# Split the data into short (length < 280) and long (length > 280) texts, keeping only
# the two columns used downstream.
# .copy() makes each subset an independent frame: both are modified in later cells
# (dropna, label encoding, text cleaning), and doing that on a slice of df1 triggers
# SettingWithCopyWarning and may not write where intended.
# NOTE(review): rows whose length is exactly 280 fall into neither subset — confirm
# whether one of the comparisons should be inclusive.
df_short = df1[df1.length<280][['text', "label"]].copy()
df_long = df1[df1.length>280][['text', "label"]].copy()

### Working with short texts

In [None]:
# Count missing values per column in the short-text subset.
df_short.isnull().sum()

In [None]:
# Drop rows with a missing text or label. Reassigning (instead of inplace=True) yields a
# fresh frame, which avoids mutating a slice of df1 and keeps the cell safe to re-run.
df_short = df_short.dropna()

In [None]:
# Shape of the short-text subset after removing rows with missing values.
df_short.shape

The model needs numeric inputs, so the labels are encoded as integers below. The *text* column needs a similar numeric representation, but because that depends on the model architecture, it is done in the subsequent notebook.

#### Real is 1 while Fake is 0.

In [4]:
from sklearn.preprocessing import LabelEncoder

# Replace the `label` column with integer codes.
# NOTE(review): the "Real is 1, Fake is 0" statement above depends on how LabelEncoder
# orders the distinct label values — confirm by inspecting enc.classes_ after fitting.
enc = LabelEncoder()
df_short['label'] = enc.fit_transform(df_short['label'])

NameError: name 'df_short' is not defined

In [None]:
# Preview the short texts after label encoding.
df_short.head()

In [None]:
# Look at a single raw text (row position 4) before any cleaning is applied.
df_short.iloc[4]['text']

In [None]:
# Fixed-seed sample of 10 raw texts; the same call is repeated after cleaning so the
# before/after versions can be compared.
df_short.text.sample(10, random_state=1).to_list()

## Data Cleaning

1. Removed URLs
2. Removed User Mentions
3. Removed all numbers
4. Removed punctuation and extra spaces
5. Lowercased the text and stripped leading/trailing whitespace

In [None]:
import string
import re

# Clean the short texts step by step.
# 1. Remove URLs.
df_short['text'] = df_short['text'].apply(lambda x:re.sub(r'http\S+', '', x))
# 2. Remove user mentions. The pattern no longer requires a trailing space, so a mention
#    at the very end of a text is removed as well; leftover spaces are collapsed below.
df_short['text'] = df_short['text'].apply(lambda x:re.sub(r'@\S+', '', x))
# 3. Remove every digit character.
df_short['text'] = df_short['text'].apply(lambda x:''.join(i for i in x if not i.isdigit()))
# 4. Replace each punctuation character with a space so adjacent words stay separated.
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
df_short['text'] = df_short['text'].str.translate(table)
# 5. Collapse runs of spaces. regex=True is required: recent pandas versions treat the
#    pattern as a literal string by default, in which case ' +' would never match.
df_short['text'] = df_short['text'].str.replace(' +', ' ', regex=True)
# 6. Lowercase and trim leading/trailing whitespace.
df_short['text'] = df_short['text'].str.lower()
df_short['text'] = df_short['text'].str.strip()

In [None]:
# Same fixed-seed sample as before cleaning, now showing the cleaned texts.
df_short.text.sample(10, random_state=1).to_list()

### Doubts
* Keep only English tweets, or translate the non-English ones to English?

In [None]:
#Train test split
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/test split of the short texts, with a fixed seed.
X, y = df_short['text'], df_short['label']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
    stratify=y,
)

In [None]:
# Put each split's texts and labels back side by side as two-column frames.
df_short_train = pd.concat([X_train, y_train], axis='columns')
df_short_test = pd.concat([X_test, y_test], axis='columns')

In [None]:
# df_short_test = df_short[~df_short.index.isin(df_short_train.index)][df_short.label==1]
# df_short_test = pd.concat([df_short_test, df_short[~df_short.index.isin(df_short_train.index)][df_short.label==0].sample(n = df_short_test.shape[0], random_state=1)])

In [None]:
# Number of whitespace-separated tokens in every short training text.
lengths = [len(text.split()) for text in df_short_train['text']]
print(max(lengths))
print(min(lengths))

In [None]:
import matplotlib.pyplot as plt

# Histogram of the values in `lengths`.
plt.hist(lengths)

In [None]:
import numpy as np
# Median of the values in `lengths`.
np.median(lengths)

In [None]:
# Keep only training texts that still contain at least one token after cleaning.
# The mask is derived from the frame itself rather than from the global `lengths` list:
# `lengths` is reassigned later for the long texts, and a stale or wrong-sized list would
# make this cell fail (or filter the wrong rows) when re-run.
# NOTE(review): the test split is not filtered here, so it may still contain empty texts.
df_short_train = df_short_train[df_short_train['text'].str.split().str.len() > 0]

In [None]:
# Shape of the short-text training split after dropping empty texts.
df_short_train.shape

In [None]:
# Class balance of the short-text training split, shown as a one-column table.
df_short_train['label'].value_counts().to_frame()

We then save the preprocessed train and test splits. (The export of a 10% sample is currently commented out further below.)

In [None]:
# Check for missing values per column before writing the training split to disk.
df_short_train.isnull().sum()

In [None]:
# Write both short-text splits to the working directory, without the index column.
# NOTE(review): a later cell reads 'data/shorttextpreprocessedtrain.csv', which does not
# match the name written here — confirm whether the file is renamed/moved manually.
df_short_train.to_csv('short_text_preprocessed_train.csv', index=False)
df_short_test.to_csv('short_text_preprocessed_test.csv', index=False)

In [None]:
# Shape of the short-text training split that was written to disk.
df_short_train.shape

In [None]:
# df_short.sample(n=int(len(df_short)*0.1), random_state=111).to_csv('short_text_sampled.csv', index=False)

## Working with long texts

In [None]:
# Preview the long-text subset.
df_long.head()

In [None]:
# Replace the long texts' `label` column with integer codes, using a freshly fitted encoder.
# NOTE(review): this encoder is fitted independently of the one used for the short texts;
# the integer codes only agree if both subsets contain the same set of label values.
# NOTE(review): unlike df_short, df_long has no dropna step before this point — confirm it
# contains no missing values, since the regex cleaning below calls re.sub on every text.
enc = LabelEncoder()
df_long['label'] = enc.fit_transform(df_long['label'])

In [None]:
# Preview the long texts after label encoding.
df_long.head()

In [None]:
# One fixed-seed raw long text; the same call is repeated after cleaning for comparison.
df_long.text.sample(1, random_state=1).to_list()

In [None]:
# Clean the long texts step by step.
# 1. Remove URLs.
df_long['text'] = df_long['text'].apply(lambda x:re.sub(r'http\S+', '', x))
# 2. Remove user mentions. The pattern no longer requires a trailing space, so a mention
#    at the very end of a text is removed as well; leftover spaces are collapsed below.
df_long['text'] = df_long['text'].apply(lambda x:re.sub(r'@\S+', '', x))
# 3. Remove every digit character.
df_long['text'] = df_long['text'].apply(lambda x:''.join(i for i in x if not i.isdigit()))
# 4. Replace each punctuation character with a space so adjacent words stay separated.
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
df_long['text'] = df_long['text'].str.translate(table)
# 5. Collapse runs of spaces. regex=True is required: recent pandas versions treat the
#    pattern as a literal string by default, in which case ' +' would never match.
df_long['text'] = df_long['text'].str.replace(' +', ' ', regex=True)
# 6. Lowercase and trim leading/trailing whitespace.
df_long['text'] = df_long['text'].str.lower()
df_long['text'] = df_long['text'].str.strip()

In [None]:
# Same fixed-seed sample as before cleaning, now showing the cleaned text.
df_long.text.sample(1, random_state=1).to_list()

In [None]:
#Train test split

# Stratified 80/20 train/test split of the long texts, with a fixed seed.
X, y = df_long['text'], df_long['label']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
    stratify=y,
)

# Put each split's texts and labels back side by side as two-column frames.
df_long_train = pd.concat([X_train, y_train], axis='columns')
df_long_test = pd.concat([X_test, y_test], axis='columns')

In [None]:
# Number of whitespace-separated tokens in every long training text.
lengths = [len(text.split()) for text in df_long_train['text']]
print(max(lengths))
print(min(lengths))

In [None]:
# Histogram of `lengths`, leaving out values of 5000 or more
# (presumably so the extreme tail does not dominate the plot).
plt.hist([l for l in lengths if l<5000])

In [None]:
# Median of the values in `lengths`.
np.median(lengths)

In [None]:
# Shape of the long-text training split before the length filter.
df_long_train.shape

In [None]:
# Keep only training texts with fewer than 1000 whitespace-separated tokens.
# The mask is derived from the frame itself rather than from the global `lengths` list,
# so the cell stays correct when re-run after the frame has already been filtered.
# NOTE(review): the test split is not filtered, so it can still contain longer texts.
df_long_train = df_long_train[df_long_train['text'].str.split().str.len() < 1000]

In [None]:
# Shape of the long-text training split after the length filter.
df_long_train.shape

In [None]:
# Class balance of the long-text training split after filtering.
df_long_train.label.value_counts()

In [None]:
# Write both long-text splits to the working directory, without the index column.
df_long_train.to_csv('long_text_preprocessed_train.csv', index=False)
df_long_test.to_csv('long_text_preprocessed_test.csv', index=False)

In [None]:
import pandas as pd

In [None]:
# `os` is used below but was only imported in a later cell, so this cell raised NameError
# when the notebook was run top to bottom on a fresh kernel.
import os

# For each CoAID tweet category, gather the tweet ids from every snapshot folder and write
# them as a single-row CSV (columns: id, news_url, title, tweet_ids) under the paired name.
# NOTE(review): the output names/columns presumably mimic the FakeNewsNet input format so
# its downloader can fetch these tweets — confirm.
categories = ['ClaimFakeCOVID-19_tweets', 'ClaimRealCOVID-19_tweets', 'NewsFakeCOVID-19_tweets', 'NewsRealCOVID-19_tweets']
filenames = ['politifact_fake', 'politifact_real', 'gossipcop_fake', 'gossipcop_real']
snapshots = ['05-01-2020', '07-01-2020', '09-01-2020', '11-01-2020']

for category, filename in zip(categories, filenames):
    print("Working on category:", category)
    tweet_ids = []
    for snapshot in snapshots:
        # Not every snapshot folder contains every category file; skip the missing ones.
        if category+'.csv' in os.listdir(f'../CoAID/{snapshot}'):
            df = pd.read_csv(f'../CoAID/{snapshot}/{category}.csv')
            tweet_ids.extend(df.tweet_id.to_list())

    # All ids for the category are stored tab-separated in one cell of a one-row frame.
    final = pd.DataFrame({'id':[category], 'news_url':[''], 'title':[''], 'tweet_ids':'\t'.join(map(str, tweet_ids))})
    print("Saving it to filename: {}.csv".format(filename))
    final.to_csv(f'../CoAID/{filename}.csv', index=False)

In [None]:
import json
# `os` is used below but was only imported in a later cell, so this cell raised NameError
# when the notebook was run top to bottom on a fresh kernel.
import os

# Root of the downloaded FakeNewsNet dataset. Built with os.path.join (and a trailing
# separator) so it resolves on any OS while matching the previous backslash form on Windows.
PATH = os.path.join('..', 'FakeNewsNet', 'code', 'fakenewsnet_dataset', '')

# Distinct tweet texts per class; sets de-duplicate identical texts.
real_tweets = set()
fake_tweets = set()

for status in ['real', 'fake']:
    for category in ['gossipcop', 'politifact']:
        # NOTE(review): only the first entry returned by os.listdir is used, and listdir
        # order is arbitrary — confirm a single source folder per category is intended.
        source = os.listdir(os.path.join(PATH, category, status))[0]
        DIR = os.path.join(PATH, category, status, source, 'tweets', '')
        print(DIR)
        tweet_files = os.listdir(DIR)
        print(len(set(tweet_files)))
        for file_name in tweet_files:
            # Read explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
            # Windows) can fail on tweets containing non-ASCII characters.
            with open(os.path.join(DIR, file_name), encoding='utf-8') as f:
                tweet = json.load(f)
            tweet_text = tweet.get('text')
            # Skip records without a text: a None in the set would make the regex
            # cleaning applied to these texts later fail with a TypeError.
            if tweet_text is None:
                continue
            if status=='real':
                real_tweets.add(tweet_text)
            else:
                fake_tweets.add(tweet_text)
        print("Counts for real and fake tweets:", (len(real_tweets), len(fake_tweets)))

In [None]:
# Build one labelled frame from the collected tweet texts: real -> 1, fake -> 0.
# A text present in both sets appears twice, once under each label.
real_texts = list(real_tweets)
fake_texts = list(fake_tweets)
df_final = pd.DataFrame({
    'text': real_texts + fake_texts,
    'label': [1] * len(real_texts) + [0] * len(fake_texts),
})

In [None]:
import string
import re

# Clean the collected tweet texts step by step.
# 1. Remove URLs.
df_final['text'] = df_final['text'].apply(lambda x:re.sub(r'http\S+', '', x))
# 2. Remove user mentions. The pattern no longer requires a trailing space, so a mention
#    at the very end of a text is removed as well; leftover spaces are collapsed below.
df_final['text'] = df_final['text'].apply(lambda x:re.sub(r'@\S+', '', x))
# 3. Remove every digit character.
df_final['text'] = df_final['text'].apply(lambda x:''.join(i for i in x if not i.isdigit()))
# 4. Replace each punctuation character with a space so adjacent words stay separated.
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
df_final['text'] = df_final['text'].str.translate(table)
# 5. Collapse runs of spaces. regex=True is required: recent pandas versions treat the
#    pattern as a literal string by default, in which case ' +' would never match.
df_final['text'] = df_final['text'].str.replace(' +', ' ', regex=True)
# 6. Lowercase and trim leading/trailing whitespace.
df_final['text'] = df_final['text'].str.lower()
df_final['text'] = df_final['text'].str.strip()

In [None]:
# Preview the cleaned tweet frame.
df_final.head()

In [None]:
# Count missing values per column in the cleaned tweet frame.
df_final.isnull().sum()

In [None]:
# (rows, columns) of the cleaned tweet frame.
df_final.shape

In [None]:
import os

# Save df_final, creating the target folder first if it does not exist.
p = 'tmp_dir/another_dir/saved_df.csv'
# Derive the folder from the path itself rather than from its first two components, so
# the cell stays correct when `p` has a different number of directory levels.
out_dir = os.path.dirname(os.path.normpath(p))
print(out_dir)
if out_dir:  # a bare file name has no folder to create
    os.makedirs(out_dir, exist_ok=True)
df_final.to_csv(p, index=False)

In [None]:
# Class balance of the collected tweets.
df_final.label.value_counts()

In [None]:
# Shape of the short-text training CSV that is merged with the collected tweets below.
# NOTE(review): this path differs from the 'short_text_preprocessed_train.csv' written
# earlier in the notebook — confirm the file was renamed/moved into data/.
pd.read_csv('data/shorttextpreprocessedtrain.csv').shape

In [None]:
# Stack the collected tweets on top of the preprocessed short-text training set.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; as with append's
# default, the original row indices are kept.
df_final_final = pd.concat([df_final, pd.read_csv('data/shorttextpreprocessedtrain.csv')])

In [None]:
# Shuffle all rows. A fixed seed (the same value used for the other random steps in this
# notebook) makes the saved row order reproducible across runs.
df_final_final = df_final_final.sample(frac = 1, random_state=1)

In [None]:
# Write the combined, shuffled dataset without the index column.
df_final_final.to_csv('data/newdatasetwithcoviddata.csv', index=False)

In [None]:
# Reload the saved dataset, drop rows containing missing values, and overwrite the file.
# NOTE(review): the missing values presumably come from texts that were empty after
# cleaning and are read back from CSV as NaN — confirm.
df_final_final = pd.read_csv('data/newdatasetwithcoviddata.csv').dropna()
df_final_final.to_csv('data/newdatasetwithcoviddata.csv', index=False)

In [None]:
# Verify that the rewritten file contains no missing values.
pd.read_csv('data/newdatasetwithcoviddata.csv').isnull().sum()

### Trying out multiprocessing

In [None]:
# import pandas as pd
# import multiprocessing as mp
# import time
# import re
# from nltk.corpus import stopwords
# import string

# t = str.maketrans(dict.fromkeys(string.punctuation))

# def clean_text(text):
#     # Remove stop words
#     stops = set(stopwords.words("english"))
#     text = " ".join(list(set(text.lower().split()) - stops))
#     # Remove Special Characters
#     text = text.translate(t)
#     # removing the extra spaces
#     text = re.sub(' +',' ', text)
#     return text

# df = pd.read_csv("src/Blob_04_05_2021/request_2667/fileblock_0.csv") # file loading
# print("Columns of the dataset", list(df.columns))
# print("Total records of the dataset", len(df))

# # Before Parallel Processing
# df1 = df.copy()
# t1 = time.time()
# df1['tweet'] = df1['tweet'].apply(clean_text)
# t2 = time.time()
# print("time consuming before Parallel Processing to process the Dataset {0:.2f}s".format(round(t2-t1, 2)))

# # After Parallel Processing
# p = mp.Pool(mp.cpu_count()-1) # Data parallelism Object
# df2 = df.copy()
# t3 = time.time()
# df2['tweet'] = p.map(clean_text, df2['tweet'])
# t4 = time.time()

# print("time consuming after Parallel Processing to process the Dataset {0:.2f}s".format(round(t4-t3, 2)))