In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import *

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import *

import time

import re
import csv
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import itertools

# Fetch the NLTK 'punkt' and 'stopwords' resources (no-op if already present).
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# English Snowball stemmer shared by the text-cleaning code.
stemmer = SnowballStemmer(language="english")

# Reading 1 million samples (only the three columns that are used).
url_1mil = '1mil_23032023.csv'
df_1mil = pd.read_csv(filepath_or_buffer=url_1mil, usecols=['domain', 'type', 'content'])

In [None]:
# Loaded lazily and kept for the lifetime of the kernel: asking NLTK for the
# stop-word list once per *word* dominated the runtime of clean_text.
_STOPWORD_CACHE = {}

# Translation table that deletes every character in string.punctuation:
# !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text):
    """Normalise one raw text and return its stemmed, stop-word-free tokens.

    Steps, in order: collapse whitespace, lowercase, strip quote characters,
    replace URLs / e-mail addresses / timestamps / numbers with the placeholder
    tokens URL, EMAIL, DATE, TIME and NUM, delete dashes and ASCII punctuation,
    drop English stop words, and stem the remaining words with the module-level
    Snowball ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    # Collapse every whitespace run (newlines included) into ONE space.
    # Deleting newlines / double spaces outright glued neighbouring words
    # together ("foo\nbar" -> "foobar").
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()

    text = re.sub(r'(\´)|(\`)|(\')|(\")|(\“)|(\”)', '', text)

    # Raw string: "\S" in a normal string literal is an invalid escape sequence.
    # The angle brackets of '<URL>' are stripped by the punctuation pass below.
    cleaned = re.sub(r'http\S+|www\S+', '<URL>', text)
    # Consume the whole address; a single trailing \S turned "a@b.com" into "EMAILom".
    cleaned = re.sub(r'\S+@\S+\.\S+', 'EMAIL', cleaned)
    # \S* (not \S) so a bare "example.com" with nothing after it is caught too.
    cleaned = re.sub(r'\S+\.com\S*', 'URL', cleaned)
    # \d+ for the fractional seconds; with a single \d the remaining digits
    # were left behind and became a separate NUM token.
    cleaned = re.sub(
        r'(\d{4}/\d{2}/\d{2} \d{2}\:\d{2}\:\d{2}\.\d+)|(\d{4}-\d{2}-\d{2} \d{2}\:\d{2}\:\d{2}\.\d+)',
        'DATE',
        cleaned,
    )
    cleaned = re.sub(r'\d{2}\:\d{2}\:\d{2}\.\d+', 'TIME', cleaned)
    cleaned = re.sub(r'\d+,?\.?\d*\.?\d*', 'NUM', cleaned)
    cleaned = re.sub(r'(\-)|(\—)', '', cleaned)

    # remove punctuation
    nopunc = cleaned.translate(_PUNCT_TABLE)

    # remove stopwords, stem what is left and return it as a list
    stop_words = _english_stopwords()
    return [stemmer.stem(word) for word in nopunc.split() if word not in stop_words]

def sorting_df(df):
    """Filter the raw rows and add a binary fake/not-fake label.

    The input frame is left untouched; a new frame is returned. (The previous
    in-place version modified the caller's frame, which in this notebook is a
    ``.loc`` slice of the full data set.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``type`` and ``content``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` that have no NaN in any column, whose ``type`` is
        neither ``'unknown'`` nor ``'rumor'``, de-duplicated on ``content``
        (first occurrence kept), with an extra integer column ``type_binary``
        that is 1 when ``type`` is in the fake group and 0 otherwise.
    """
    # 'unknown' carries no label; 'rumor' is removed because it is not on the GitHub list.
    excluded_types = ['unknown', 'rumor']
    fake_group = ['fake', 'satire', 'bias', 'conspiracy', 'junksci', 'hate', 'unreliable']

    kept = df.dropna(axis=0)  # remove rows containing NaN
    kept = kept[~kept['type'].isin(excluded_types)]
    kept = kept.drop_duplicates(subset=['content'], keep='first')  # remove duplicates

    # assign() builds a new frame, so nothing is ever written into a view of the input.
    return kept.assign(type_binary=kept['type'].isin(fake_group).astype(int))

# Full preprocessing of one chunk: filter and label the rows, then tokenise the text.
def process(dataframe):
    """Run ``sorting_df`` and ``clean_text`` over a frame and return the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw rows; must contain the columns ``type`` and ``content``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame returned by ``sorting_df`` with every ``content``
        value replaced by the token list produced by ``clean_text``.
    """
    # Work on a copy so the caller's frame (a slice of the full data set in
    # this notebook) is never modified.
    df = sorting_df(dataframe.copy())
    return df.assign(content=df['content'].apply(clean_text))

In [None]:
# Split into ten consecutive chunks of CHUNK_SIZE rows; the last chunk also
# takes any rows beyond 10 * CHUNK_SIZE.
#
# Positional .iloc is used instead of label-based .loc: .loc slices include the
# end label, which made the first chunk 100,001 rows long, and they only work
# when the index is the default 0..n-1 range. .copy() makes every chunk an
# independent frame rather than a slice of df_1mil.
CHUNK_SIZE = 100000

df_1 = df_1mil.iloc[0 * CHUNK_SIZE:1 * CHUNK_SIZE].copy()
df_2 = df_1mil.iloc[1 * CHUNK_SIZE:2 * CHUNK_SIZE].copy()
df_3 = df_1mil.iloc[2 * CHUNK_SIZE:3 * CHUNK_SIZE].copy()
df_4 = df_1mil.iloc[3 * CHUNK_SIZE:4 * CHUNK_SIZE].copy()
df_5 = df_1mil.iloc[4 * CHUNK_SIZE:5 * CHUNK_SIZE].copy()
df_6 = df_1mil.iloc[5 * CHUNK_SIZE:6 * CHUNK_SIZE].copy()
df_7 = df_1mil.iloc[6 * CHUNK_SIZE:7 * CHUNK_SIZE].copy()
df_8 = df_1mil.iloc[7 * CHUNK_SIZE:8 * CHUNK_SIZE].copy()
df_9 = df_1mil.iloc[8 * CHUNK_SIZE:9 * CHUNK_SIZE].copy()
df_10 = df_1mil.iloc[9 * CHUNK_SIZE:].copy()

In [None]:
# Clean chunk 1 and save the result as its own CSV file.
df_1 = process(dataframe=df_1)
df_1.to_csv(path_or_buf='1mill_1_cleaned.csv')

In [None]:
# Clean chunk 2 and save the result as its own CSV file.
df_2 = process(dataframe=df_2)
df_2.to_csv(path_or_buf='1mill_2_cleaned.csv')

In [None]:
# Clean chunk 3 and save the result as its own CSV file.
df_3 = process(dataframe=df_3)
df_3.to_csv(path_or_buf='1mill_3_cleaned.csv')

In [None]:
# Clean chunk 4 and save the result as its own CSV file.
df_4 = process(dataframe=df_4)
df_4.to_csv(path_or_buf='1mill_4_cleaned.csv')

In [None]:
# Clean chunk 5 and save the result as its own CSV file.
df_5 = process(dataframe=df_5)
df_5.to_csv(path_or_buf='1mill_5_cleaned.csv')

In [None]:
# Clean chunk 6 and save the result as its own CSV file.
df_6 = process(dataframe=df_6)
df_6.to_csv(path_or_buf='1mill_6_cleaned.csv')

In [None]:
# Clean chunk 7 and save the result as its own CSV file.
df_7 = process(dataframe=df_7)
df_7.to_csv(path_or_buf='1mill_7_cleaned.csv')

In [None]:
# Clean chunk 8 and save the result as its own CSV file.
df_8 = process(dataframe=df_8)
df_8.to_csv(path_or_buf='1mill_8_cleaned.csv')

In [None]:
# Clean chunk 9 and save the result as its own CSV file.
df_9 = process(dataframe=df_9)
df_9.to_csv(path_or_buf='1mill_9_cleaned.csv')

In [None]:
# Clean chunk 10 and save the result as its own CSV file.
df_10 = process(dataframe=df_10)
df_10.to_csv(path_or_buf='1mill_10_cleaned.csv')

In [None]:
# Stack the ten cleaned chunks (in order, original index labels kept) and write
# the combined frame to disk.
# NOTE(review): duplicates were dropped inside each chunk only, so the same
# `content` appearing in two different chunks is still present here.
df_full_cleaned = pd.concat(
    objs=[
        df_1, df_2, df_3, df_4, df_5,
        df_6, df_7, df_8, df_9, df_10,
    ]
)
df_full_cleaned.to_csv(path_or_buf='1mill_cleaned.csv')