This notebook clean up the text, take off any special characters.
This code is adapted from group member Juhee Sung-Schenck 

In [1]:
# import libraries

import pandas as pd
import numpy as np
import re

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer 

In [2]:
# read in the clean dataset

df = pd.read_csv('../data/cleaned.csv')

In [3]:
print(df.shape)

df.head(3)

(19999, 6)


Unnamed: 0,id,text,user_name,date,lat,long
0,1305657570774978560,Smoke from California wildfires causes hazy sk...,FireandAviation,2020-09-14 23:59:57+00:00,37.280386,-122.128109
1,1304359517552095235,96% Overwhelmingly Positive Reviews! Grab a fr...,JoinDeepRock,2020-09-11 10:01:57+00:00,36.052556,-119.40677
2,1305657551703412736,So a 2-3 degree temperature difference is what...,matthew_paul17,2020-09-14 23:59:52+00:00,35.839572,-122.243498


In [4]:
## count sentences in each tweet (row)

# set up an emty list for sentence counts

n_sentence = []

# iterate through the text column

for i in range(len(df['text'])):
    
    n_sentence.append(len(sent_tokenize(df['text'][i].lower())))
    
# store it in the dataframe

df['n_sentence'] = n_sentence

In [5]:
# instantiate lemmatizer and tokenizer

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

In [6]:
## tokenize and lemmatize

# create a column 'text_clean'

df['text_clean'] = ''

# iterate through each row in the column text_all
    
for i in range(len(df['text'])):
        
    # tokenize each word in text into its own string
    text_token = []
    text_token.extend(tokenizer.tokenize(df['text'][i].lower()))
    text_tokens = []
    [text_tokens.append(word) for word in text_token if word not in text_tokens]
        
    # lemmatize the words
    text_lemmatize = []
    for j in range(len(text_tokens)):
        text_lemmatize.append(lemmatizer.lemmatize(text_tokens[j]))
        
    # remove characters and numbers
    clean_text = []
    for k in range(len(text_lemmatize)):
        clean_text.append(re.sub('[^a-zA-Z]', '', text_lemmatize[k]))    
        
    # group them together
    texts_collection = [text for text in clean_text]
    
    # put the words back to one long string for vectorization
    texts_collection = ' '.join(texts_collection)

    # fill new column
    df['text_clean'][i] = texts_collection

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
## count words in each tweet (row)

# set up an empty list for word counts

n_words = []

# iterate through the text column

for i in range(len(df['text_clean'])):
    
    word_tokens = tokenizer.tokenize(df['text_clean'][i])
    cnt = len(word_tokens)
    n_words.append(cnt)
    
# store it in the dataframe

df['n_words'] = n_words

In [8]:
df.head()

Unnamed: 0,id,text,user_name,date,lat,long,n_sentence,text_clean,n_words
0,1305657570774978560,Smoke from California wildfires causes hazy sk...,FireandAviation,2020-09-14 23:59:57+00:00,37.280386,-122.128109,1,smoke from california wildfire cause hazy sky ...,9
1,1304359517552095235,96% Overwhelmingly Positive Reviews! Grab a fr...,JoinDeepRock,2020-09-11 10:01:57+00:00,36.052556,-119.40677,2,overwhelmingly positive review grab a frien...,15
2,1305657551703412736,So a 2-3 degree temperature difference is what...,matthew_paul17,2020-09-14 23:59:52+00:00,35.839572,-122.243498,1,so a degree temperature difference is what c...,36
3,1305657540227850240,@POTUS is right about the #wildfires Expert in...,Josie95450522,2020-09-14 23:59:50+00:00,34.897778,-119.722077,2,potus is right about the wildfires expert inve...,39
4,1305657538281533440,Wildfires in California ARE caused by poor lan...,Arqahn,2020-09-14 23:59:49+00:00,38.370841,-120.107434,1,wildfire in california are caused by poor land...,9


In [9]:
# reorder the order of the columns

df = df[['id', 'user_name','lat', 'long', 'date', 'text', 'text_clean', 'n_sentence', 'n_words']]

In [10]:
df.head()

Unnamed: 0,id,user_name,lat,long,date,text,text_clean,n_sentence,n_words
0,1305657570774978560,FireandAviation,37.280386,-122.128109,2020-09-14 23:59:57+00:00,Smoke from California wildfires causes hazy sk...,smoke from california wildfire cause hazy sky ...,1,9
1,1304359517552095235,JoinDeepRock,36.052556,-119.40677,2020-09-11 10:01:57+00:00,96% Overwhelmingly Positive Reviews! Grab a fr...,overwhelmingly positive review grab a frien...,2,15
2,1305657551703412736,matthew_paul17,35.839572,-122.243498,2020-09-14 23:59:52+00:00,So a 2-3 degree temperature difference is what...,so a degree temperature difference is what c...,1,36
3,1305657540227850240,Josie95450522,34.897778,-119.722077,2020-09-14 23:59:50+00:00,@POTUS is right about the #wildfires Expert in...,potus is right about the wildfires expert inve...,2,39
4,1305657538281533440,Arqahn,38.370841,-120.107434,2020-09-14 23:59:49+00:00,Wildfires in California ARE caused by poor lan...,wildfire in california are caused by poor land...,1,9


In [11]:
# check for null values

df.isnull().sum()

id            0
user_name     0
lat           0
long          0
date          0
text          0
text_clean    0
n_sentence    0
n_words       0
dtype: int64

In [12]:
# save this dataframe to a csv file

df.to_csv('../data/final.csv', index = False)