# Imports and Drive setup



In [0]:
# Install PyDrive into the Colab runtime (-U upgrades, -q keeps pip quiet).
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# authenticate_user() runs first; the application-default credentials are then
# attached to the GoogleAuth object that backs the `drive` client used below.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import re
import string


In [3]:
# Google Drive share link of the raw tweet CSV.
link = 'https://drive.google.com/open?id=1GfJxKrcClls4zWHUTFMkHM-Z6nebIrdB'

# Extract the value of the `id` query parameter. The result is stored in
# `file_id` (not `id`) so the builtin id() is not shadowed for the rest of the
# kernel session, and any further `&key=value` parameters are cut off instead
# of breaking the unpacking.
file_id = link.split('id=', 1)[1].split('&', 1)[0]
print(file_id)  # Verify that this is the bare Drive file ID

# Fetch the file from Drive and store it locally as '1mil6.csv'.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('1mil6.csv')

1GfJxKrcClls4zWHUTFMkHM-Z6nebIrdB


# Reading the csv into a dataframe

In [0]:
# The CSV has no header row, so the column names are supplied explicitly.
cols = [
    'sentiment',
    'id',
    'date',
    'query_string',
    'user',
    'text',
]
df = pd.read_csv(
    "1mil6.csv",
    names=cols,
    header=None,
    engine="python",
)

In [0]:
# Keep only the label and the tweet text. `df` is rebound rather than mutated
# in place, and errors='ignore' skips columns that are already gone, so this
# cell can be re-run without raising KeyError.
df = df.drop(columns=['id', 'date', 'query_string', 'user'], errors='ignore')

# Text Cleaning
Minimal text cleaning: removing irrelevant information (URLs, Twitter handles, numbers, ...).

In [0]:
# The regexes and the translation table are built once, because clean_text is
# called for every row of the dataset.

# URL matcher, courtesy of
# 'https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url'
# Raw strings keep the backslashes intact without invalid-escape warnings.
_URL_RE = re.compile(
    r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
    r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
    r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
    r'|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
)
_HANDLE_RE = re.compile(r'@([A-Za-z0-9_]+)')
# Entities other than &amp; would otherwise lose their '&' and ';' in the
# punctuation step and survive as the bogus tokens 'quot', 'lt' and 'gt'.
_ENTITY_RE = re.compile(r'&(?:quot|lt|gt);')
_DIGIT_HASH_RE = re.compile(r'[0-9#]')
_WHITESPACE_RE = re.compile(r'\s+')
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def clean_text(text):
  """Normalize one tweet to lowercase words separated by single spaces.

  Steps, in order: '&amp;' becomes ' and '; the entities &quot;, &lt; and &gt;
  become a space; URLs and @handles become a space; U+FFFD replacement
  characters, digits, '#' and ASCII punctuation are deleted; every run of
  whitespace (spaces, tabs, newlines) collapses to one space; leading and
  trailing whitespace is stripped; the result is lowercased.

  Args:
    text: the raw tweet as a str.

  Returns:
    The cleaned str; empty when nothing but removable content was present.
  """
  # sub &amp; by 'and' before punctuation removal would reduce it to 'amp'
  text = text.replace('&amp;', ' and ')
  text = _ENTITY_RE.sub(' ', text)

  # remove urls and twitter handles (a space is left so neighbours don't merge)
  text = _URL_RE.sub(' ', text)
  text = _HANDLE_RE.sub(' ', text)

  # remove U+FFFD replacement characters left behind by decoding errors
  text = text.replace('\ufffd', '')

  # remove numbers and hashtag signs (the hashtag word itself is kept)
  text = _DIGIT_HASH_RE.sub('', text)

  # remove punctuation
  text = text.translate(_STRIP_PUNCTUATION)

  # collapse all whitespace runs and drop the padding left at either end
  text = _WHITESPACE_RE.sub(' ', text).strip()

  return text.lower()

In [83]:
# Spot-check the cleaner on the row labelled 1.
clean_text(df.loc[1, 'text'])

'is upset that he cant update his facebook by texting it and might cry as a result school today also blah'

# Cleaning the dataset and saving the new csv to drive

In [84]:
# Clean every tweet, printing the row number every PROGRESS_EVERY rows so a
# long run shows visible progress. An explicit loop keeps the print side
# effect out of the expression that builds the list.
PROGRESS_EVERY = 400000

clean_tweets = []
for row_number, raw_tweet in enumerate(df['text']):
    if row_number % PROGRESS_EVERY == 0:
        print(row_number)
    clean_tweets.append(clean_text(raw_tweet))

0
400000
800000
1200000


## Saving the new csv to google drive


### Creating a clean dataframe


In [85]:
# Pair each cleaned tweet with the sentiment label of the same row
# (the labels are aligned on the index of `df`).
clean_df = (
    pd.DataFrame(clean_tweets, columns=['text'])
    .assign(sentiment=df['sentiment'])
)
clean_df.head()

Unnamed: 0,text,sentiment
0,awww thats a bummer you shoulda got david car...,0
1,is upset that he cant update his facebook by t...,0
2,i dived many times for the ball managed to sa...,0
3,my whole body feels itchy and like its on fire,0
4,no its not behaving at all im mad why am i he...,0


### Saving to drive as csv

In [87]:
# Write the cleaned frame to a local CSV.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# pass index=False if no downstream reader relies on it - confirm first.
clean_df.to_csv("clean.csv")

# Create a Drive file titled 'clean.csv', attach the local file and upload it.
uploaded = drive.CreateFile({'title': 'clean.csv'})
uploaded.SetContentFile("clean.csv")
uploaded.Upload()

print(f"Uploaded file with ID {uploaded.get('id')}")

Uploaded file with ID 1XyM6a8dIs6XZJwgsQ86Qx9aWn_SH1QP_
