In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import *

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import *

import time

import re
import csv
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import itertools

nltk.download('punkt')
nltk.download('stopwords')
stemmer = SnowballStemmer("english")


df_2mil = pd.read_csv('news_cleaned_2018_02_13.csv', skiprows=range(1,1000000), nrows=1000000, usecols=['domain','type','content'])

[nltk_data] Downloading package punkt to /Users/tove/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/tove/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def clean_text(text):
    text = text.replace('\n', '')
    text = text.replace('  ', '')
    text = text.lower()

    text = re.sub(r'(\´)|(\`)|(\')|(\")|(\“)|(\”)', '', text)
    
    replace = re.sub("http\S+|www\S+", '<URL>', text)
    replace = re.sub(r'\S+@+\S+\.+\S', 'EMAIL', replace)
    replace = re.sub(r'\S+\.com\S', 'URL', replace)
    replace = re.sub(r'(\d{4}/\d{2}/\d{2} \d{2}\:\d{2}\:\d{2}\.\d)|(\d{4}-\d{2}-\d{2} \d{2}\:\d{2}\:\d{2}\.\d)', 'DATE', replace)
    replace = re.sub(r'\d{2}\:\d{2}\:\d{2}\.\d', 'TIME', replace)
    replace = re.sub(r'\d+,?\.?\d*\.?\d*', 'NUM', replace)
    replace = re.sub(r'(\-)|(\—)', '', replace)
    
    # remove punctuation
    nopunc = [char for char in replace if char not in string.punctuation] 
    nopunc = ''.join(nopunc)
    
    # remove stopwords and return to list
    clean_words = [word for word in nopunc.split() if word not in stopwords.words('english')]
    cleaned = [stemmer.stem(word) for word in clean_words]
    
    return cleaned

def sorting_df(df):
    df.dropna(axis=0, inplace=True) # Fjern NaN
    df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
    df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
    df.drop_duplicates(subset=['content'], keep='first', inplace=True)

    fake_group = ['fake','satire','bias','conspiracy','junksci','hate','unreliable']
    df['type_binary'] = df['type'].isin(fake_group)
    df['type_binary'] = df['type_binary'].astype(int) # define types
    return df

# Testing clean_text on 250 samples
def process(dataframe):
    df = sorting_df(dataframe)
    df['content'] = df['content'].apply(clean_text)
    return df

In [9]:
# Split into chunks of 100K.

df_1 = df_2mil.loc[0:100000]
df_2 = df_2mil.loc[100001:200000]
df_3 = df_2mil.loc[200001:300000]
df_4 = df_2mil.loc[300001:400000]
df_5 = df_2mil.loc[400001:500000]
df_6 = df_2mil.loc[500001:600000]
df_7 = df_2mil.loc[600001:700000]
df_8 = df_2mil.loc[700001:800000]
df_9 = df_2mil.loc[800001:900000]
df_10 = df_2mil.loc[900001:]

In [10]:
df_1 = process(df_1)
df_1.to_csv('2mill_1_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True) # Fjern NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [11]:
df_2 = process(df_2)
df_2.to_csv('2mill_2_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True) # Fjern NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [12]:
df_3 = process(df_3)
df_3.to_csv('2mill_3_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True) # Fjern NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [13]:
df_4 = process(df_4)
df_4.to_csv('2mill_4_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True) # Fjern NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [14]:
df_5 = process(df_5)
df_5.to_csv('2mill_5_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True) # Fjern NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [15]:
df_6 = process(df_6)
df_6.to_csv('2mill_6_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True) # Fjern NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [16]:
df_7 = process(df_7)
df_7.to_csv('2mill_7_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True) # Fjern NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [17]:
df_8 = process(df_8)
df_8.to_csv('2mill_8_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True) # Fjern NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [18]:
df_9 = process(df_9)
df_9.to_csv('2mill_9_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True) # Fjern NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [19]:
df_10 = process(df_10)
df_10.to_csv('2mill_10_cleaned.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True) # Fjern NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'unknown'].index, inplace = True) # drop unknown
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['type'] == 'rumor'].index, inplace = True) # fjerner fordi den ikke er på Github-listen
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_g

In [20]:
df_full_cleaned = pd.concat([df_1,df_2,df_3,df_4,df_5,df_6,df_7,df_8,df_9,df_10])
df_full_cleaned.to_csv('2mill_cleaned.csv')

In [21]:
df_full_cleaned

Unnamed: 0,domain,type,content,type_binary
1,wikileaks.org,unreliable,"[tortor, encrypt, anonymis, network, make, har...",1
14,wikileaks.org,unreliable,"[raw, contentconfidenti, page, num, maseru, nu...",1
25,wikileaks.org,unreliable,"[raw, contentlimit, offici, use, page, num, me...",1
26,wikileaks.org,unreliable,"[raw, contentconfidenti, page, num, rabat, num...",1
38,wikileaks.org,unreliable,"[raw, contentlimit, offici, use, page, num, st...",1
...,...,...,...,...
950327,rawstory.com,political,"[republican, hous, repres, wednesday, held, so...",0
950328,ecowatch.com,political,"[num, sign, alec, lose, war, solar]",0
950329,ecowatch.com,political,"[obama, slam, koch, brother, clean, energi, su...",0
950332,attn.com,political,"[pay, attn, share, commentari, news, articl, v...",0


In [None]:
df_1mil = pd.read_csv('1mill_cleaned.csv')

df_final = pd.concat([df_1mil,df_full_cleaned])

df_final.to_csv('600K_cleaned.csv')