In [7]:
import pandas as pd
import sklearn 
import nltk
import numpy as np 
import matplotlib.pyplot as ply
import scipy
import re
import lxml.html
import requests

In [8]:
#functions to clean the data
def de_hashtag(row):
    """Collect the hashtag tokens found in a row's ``text`` field.

    Despite the name, hashtags are kept rather than stripped: the text is
    split on whitespace and every token that begins with ``#`` is retained.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'text'`` that yields a string
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        The ``#``-prefixed tokens in their original order, joined by single
        spaces; an empty string when there are none.
    """
    tags_found = []
    for token in row['text'].split():
        # str.split() never yields empty tokens, so a prefix test is enough.
        if token.startswith('#'):
            tags_found.append(token)
    return ' '.join(tags_found)
def websites(row):
    """Collect the URL-like tokens found in a row's ``text`` field.

    The text is split on whitespace and each token is tested with
    ``re.match`` against a loose URL pattern (optional ``http://`` or
    ``https://`` scheme, a host-ish part, a dot and a 2-6 letter suffix,
    then optional trailing path/query characters).

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'text'`` that yields a string.

    Returns
    -------
    str
        Matching tokens in their original order, joined by single spaces;
        an empty string when nothing matches.

    Notes
    -----
    ``re.match`` only anchors at the start of the token, so a token is kept
    as soon as it *begins* with something URL-shaped; dotted words such as
    ``Hello.world`` are therefore also returned.
    """
    # Raw string: the pattern is full of regex escapes (\:, \/, \., ...) that
    # are invalid *string* escapes and trigger a SyntaxWarning in a normal
    # literal. The compiled pattern is unchanged.
    regex = r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
    url_pattern = re.compile(regex)
    tokens = row['text'].split()
    return ' '.join(token for token in tokens if url_pattern.match(token))
def phone_numbers(row):
    """Return the first phone-number-shaped token in a row's ``text`` field.

    The text is split on whitespace and each token is tested against a
    North-American style pattern: three digits (optionally parenthesised),
    a separator, three digits, a separator, four digits.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'text'`` that yields a string.

    Returns
    -------
    str or float
        The first matching token, or ``np.nan`` when no token matches.

    Notes
    -----
    NOTE(review): because matching is done per whitespace-separated token,
    the ``\\s`` separator alternatives and the optional ``+NN `` country
    prefix in the pattern can never match; numbers written with spaces
    (``315 555 1234``) are not detected. Confirm whether that is intended.
    """
    # Raw string: \+, \d, \s, \( and \) are regex escapes, not string escapes;
    # a normal literal raises a SyntaxWarning for them. Pattern is unchanged.
    regex = r'^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}$'
    phone_pattern = re.compile(regex)
    # Only the first hit is ever used, so stop scanning as soon as it is found.
    for token in row['text'].split():
        if phone_pattern.match(token):
            return str(token)
    return np.nan
def clean_text(df):
    """Add a ``clean_text`` column holding a stripped-down copy of ``text``.

    Every match of the following alternatives is deleted from each value:
    ``@`` followed by letters/digits, any single character that is not a
    letter, digit, space or tab, a ``scheme://...`` run up to the next
    whitespace, a lowercase ``rt`` at the very start of the string, and the
    literal ``http`` plus the one character after it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``'text'``.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with ``'clean_text'``
        added (or overwritten) in place.
    """
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

    def strip_noise(tweet):
        # Remove every occurrence of the noise pattern from one text value.
        return noise.sub("", tweet)

    df['clean_text'] = df['text'].apply(strip_noise)
    return df


In [9]:
#remove records if no explicit hashtags
def addHashtags(string):
    """Prefix a value with ``#``, skipping anything that stringifies to ``'nan'``.

    Parameters
    ----------
    string : object
        Value to convert; it is passed through ``str()`` first.

    Returns
    -------
    str or None
        ``'#' + str(string)``, or ``None`` when ``str(string)`` is exactly
        ``'nan'`` (which covers float NaN as well as the literal text).
    """
    label = str(string)
    if label == 'nan':
        return None
    return '#' + label
def location(row):
    """Keep only the hashtags of a row that appear in ``hashtags['1']``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'hashtags'`` that yields a whitespace-separated
        string of tokens.

    Returns
    -------
    str
        The tokens that are present in ``hashtags['1']``, in their original
        order (duplicates kept), joined by single spaces.

    Notes
    -----
    Relies on a module-level ``hashtags`` table that is not defined in the
    cells shown here; it must exist before this function is called.
    NOTE(review): column ``'1'`` presumably holds the location hashtags -
    confirm against wherever ``hashtags`` is loaded.
    """
    # Build the lookup once per call; the previous version rebuilt the full
    # list for every single word being tested.
    known_tags = set(hashtags['1'])
    return ' '.join(word for word in row['hashtags'].split() if word in known_tags)
def explicit(row):
    """Keep only the hashtags of a row that appear in ``hashtags['2']``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'hashtags'`` that yields a whitespace-separated
        string of tokens.

    Returns
    -------
    str
        The tokens that are present in ``hashtags['2']``, in their original
        order (duplicates kept), joined by single spaces.

    Notes
    -----
    Relies on a module-level ``hashtags`` table that is not defined in the
    cells shown here; it must exist before this function is called.
    NOTE(review): column ``'2'`` presumably holds the explicit hashtags -
    confirm against wherever ``hashtags`` is loaded.
    """
    # Build the lookup once per call; the previous version rebuilt the full
    # list for every single word being tested.
    known_tags = set(hashtags['2'])
    return ' '.join(word for word in row['hashtags'].split() if word in known_tags)

In [16]:
# load area codes
# Two header-less lookup tables: area code -> city/state/country, and
# area code -> latitude/longitude.
cols = ['area_code','city','state','country_code','lat','lng']
col_geo = ['area_code','lat','lng']
area_codes = pd.read_csv('https://raw.githubusercontent.com/ravisorg/Area-Code-Geolocation-Database/master/us-area-code-cities.csv',header=None,names=cols)
area_codes_geo = pd.read_csv('https://raw.githubusercontent.com/ravisorg/Area-Code-Geolocation-Database/master/us-area-code-geo.csv',header=None,names=col_geo)
area_codes['area_code'] = area_codes['area_code'].astype(int)
area_codes_geo['area_code'] = area_codes_geo['area_code'].astype(int)

# initial load of data
additional = pd.read_csv('data/eleventhrun.csv',header=0)
df = pd.read_csv('data/tenthrun.csv',header=0)
# Only the tenth run is de-duplicated here; the combined frame is
# de-duplicated again after the merges below.
df = df.drop_duplicates(subset=['text','date'])
df = pd.concat([df,additional],ignore_index=True)
df = df.loc[:,['text','date']]

# add hashtags as an extra column
df['hashtags'] = df.apply(de_hashtag,axis=1)
# get websites
df['websites'] = df.apply(websites,axis=1)
# phone numbers
df['phone_numbers'] = df.apply(phone_numbers,axis=1)

# Area code = first run of three digits in the matched phone token. Slicing
# the first three characters breaks on the "(315)-555-1234" form that
# phone_numbers() also accepts ("(31" is not an int). astype(object) keeps the
# .str accessor usable even when no row has a phone number (all-NaN column).
area_code_digits = df['phone_numbers'].astype(object).str.extract(r'(\d{3})', expand=False)

# Rows without a phone number get a sentinel that cannot collide with a real
# three-digit area code, so the key column can stay integer for the merges.
MISSING_AREA_CODE = 999999999
df['area_code'] = (
    pd.to_numeric(area_code_digits, errors='coerce')
    .fillna(MISSING_AREA_CODE)
    .astype(int)
)

df = df.merge(area_codes[['area_code','state','country_code']],on='area_code',how='left')
df = df.merge(area_codes_geo[['area_code','lat','lng']],on='area_code',how='left')

# Turn the sentinel back into a missing value (object column, as before).
df['area_code'] = df['area_code'].astype(object).mask(df['area_code'] == MISSING_AREA_CODE, np.nan)

# The area-code merges can fan out one tweet into several rows; keep the
# first row per (text, date).
df = df.drop_duplicates(subset=['text','date'])
df = clean_text(df)
df

Unnamed: 0,text,date,hashtags,websites,phone_numbers,area_code,state,country_code,lat,lng,clean_text
0,"To a GREAT Friend &amp; Kind Heart, here are A...",2020-12-07 15:57:29+00:00,#WakeUpCall #FloydLittle #Little #Syracuse #Cu...,https://t.co/Ho2RA8efJ2 https://t.co/1RrhcB4016,,,,,,,To a GREAT Friend amp Kind Heart here are ALL ...
1,remember when #Syracuse was a football blue bl...,2020-10-20 22:46:37+00:00,#Syracuse #brown #mcnabb #graves #morris #czon...,https://t.co/qzZS0jwaG4,,,,,,,remember when Syracuse was a football blue blo...
2,#Syracuse #orange #autograph #floyd #little #j...,2020-04-18 08:39:16+00:00,#Syracuse #orange #autograph #floyd #little #j...,https://t.co/40pmQZGC8j,,,,,,,Syracuse orange autograph floyd little jersey ...
3,RT @FloppZilla: Legends! #GOAT #SYRACUSE #Cson...,2019-04-14 14:28:29+00:00,#GOAT #SYRACUSE #Csonka #Little #Coughlin #Boe...,https://t.co/OBohpJXpdM,,,,,,,RT Legends GOAT SYRACUSE Csonka Little Coughl...
4,RT @FloppZilla: Legends! #GOAT #SYRACUSE #Cson...,2019-04-14 05:15:17+00:00,#GOAT #SYRACUSE #Csonka #Little #Coughlin #Boe...,https://t.co/OBohpJXpdM,,,,,,,RT Legends GOAT SYRACUSE Csonka Little Coughl...
...,...,...,...,...,...,...,...,...,...,...,...
9518,"Gewoon, omdat dit heel fijn is om naar te kijk...",2018-03-07 09:12:22+00:00,#hunk #gtst #bing #model #shoot…,https://t.co/P7DMp9eEGb,,,,,,,Gewoon omdat dit heel fijn is om naar te kijke...
9519,Hi stranger!! #bing #djbing #westbywang #swag ...,2018-02-22 05:08:47+00:00,#bing #djbing #westbywang #swag #slowmo #kuala...,https://t.co/rnnNTETeUb,,,,,,,Hi stranger bing djbing westbywang swag slowmo...
9520,My kind of Cho2~\n#bing #djbing #westbywang #h...,2018-02-17 11:31:49+00:00,#bing #djbing #westbywang #happycny #model #ac...,https://t.co/2I63GfHkac,,,,,,,My kind of Cho2bing djbing westbywang happycny...
9521,New Christian Community Website https://t.co/W...,2018-02-15 19:13:25+00:00,#about #allcategory #bing #graphics #hr #inter...,https://t.co/Wo4KppoJl8,,,,,,,New Christian Community Website about allcate...


In [17]:
# Persist the cleaned frame to CSV (with pandas' default settings the row
# index is written as the first, unnamed column).
# NOTE(review): the inputs above are read from 'data/' (lowercase) but this
# writes to 'Data/'; on a case-sensitive filesystem those are different
# directories - confirm the intended path.
df.to_csv('Data/cleanData.csv')