In [1]:
import string
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

In [2]:
def token_lem_tweets(tweet):
    '''Cleans the raw Tweets and produces 3 word ngrams of the Tweets.
    
    Keyword argument:
    tweet -- the Tweet to be processed
    
    Return:
    tokenized words from the tweet text

    ''' 
    # Create the tokenizer and lemmatizer
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    lemmatizer = WordNetLemmatizer()

    # Tokenize the Tweet
    tokenized = tokenizer.tokenize(tweet)
    
    # Some of the below code was found here: 
    # https://www.youtube.com/watch?v=7N_2OsLXFlA&list=PLmcBskOCOOFW1SNrz6_yzCEKGvh65wYb9&index=19
    # Create variables that store the punctuation and stopwords to be removed
    punctuation = list(string.punctuation)
    swords = stopwords.words('english') + punctuation + ['rt', 'via', 
                                                         '...', 'u', 
                                                         'ur', 'r', 
                                                         'covid', 'coronavirus', 
                                                         'covid19'
                                                        ]

    # Create a list of lemmatized words, remove punctuation, stopwords and numbers
    token_list = [lemmatizer.lemmatize(word) 
                  for word in tokenized 
                  if word not in swords 
                  and not word.isdigit()]
    
    return token_list



In [3]:
reduced_df = pd.DataFrame(columns=['date', 'words', 'id'])

In [4]:
# bump up chunksize on final go and get rid of nrows arg
# alltweetsdata is of length 4217125, 100k chunk size in theory leads to about 43 loops. 

cols = ['id', 'timestamp', 'text']
for chunk in pd.read_csv("allTweetData.csv", usecols=cols, chunksize=10**5):
    all_tweets = chunk.drop_duplicates()

    # Change the timestamp to datetime and create a date column
    all_tweets['timestamp'] = pd.to_datetime(all_tweets['timestamp'])
    all_tweets['date']  = all_tweets['timestamp'].dt.date

    # Change all tweets to lowercase and remove any non-ASCII characters
    # Found code from: 
    # https://stackoverflow.com/questions/36340627/remove-non-ascii-characters-from-pandas-column
    all_tweets['text'] = all_tweets['text'].str.lower().str.encode('ascii', 'ignore'
                                                                  ).str.decode('ascii')

    # Remove all links and multiple hashes for one hashtagged word
    all_tweets = all_tweets.replace({'text': {r"http\S+": "", '#{1,}': ""}}, regex=True)

    # Add the tokenized words column
    all_tweets['words'] = all_tweets['text'].apply(token_lem_tweets)

    # drop columns and explode the tokens to their own rows
    all_tweets.drop(columns=['text', 'timestamp'], inplace=True)
    all_tweets = all_tweets.explode('words')

    # groupby stage
    all_tweets = all_tweets.groupby(['date', 'words']).count()
    all_tweets.reset_index(inplace=True)

    # reduce stage
    reduced = all_tweets[all_tweets['id'] > 20]
    
    reduced_df = reduced_df.append(reduced, ignore_index=True)
    print(len(reduced_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


12698
25502
38006
49778


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


62330
74317
85421
96888


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


108797
120854
132240
144578


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


156388
169260


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


181410
191753
202211
212443
222698
233275
243799
254352
264545
274792
285220
295663
306513
316832
326805


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


337268


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


347911
358110
368382
379442
390594
400810
411111
422178
433188


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


444177
454695
464946
466985


In [5]:
reduced_df.sort_values(by='id', ascending=False)

Unnamed: 0,date,words,id
133546,2020-03-22,corona,12592
145058,2020-03-20,corona,10670
147719,2020-03-24,corona,9905
146471,2020-03-23,corona,9679
354574,2020-08-26,exam,7699
...,...,...,...
387031,2020-06-13,strength,21
214126,2020-07-16,praise,21
214114,2020-07-16,postive,21
214091,2020-07-16,poisoned,21


In [19]:
x = list(reduced_df['date'].unique())

In [7]:
reduced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466985 entries, 0 to 466984
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    466985 non-null  object
 1   words   466985 non-null  object
 2   id      466985 non-null  object
dtypes: object(3)
memory usage: 10.7+ MB


In [22]:
reduced_df.to_csv('word_counts_data.csv')

In [8]:
# read in first 100k rows test

# cols = ['id', 'timestamp', 'text']
# chunk = pd.read_csv("allTweetData.csv", usecols=cols, nrows=10**5)
# chunk.columns

# all_tweets = chunk.drop_duplicates()

# # Change the timestamp to datetime and create a date column
# all_tweets['timestamp'] = pd.to_datetime(all_tweets['timestamp'])
# all_tweets['date']  = all_tweets['timestamp'].dt.date

# # Change all tweets to lowercase and remove any non-ASCII characters
# # Found code from: 
# # https://stackoverflow.com/questions/36340627/remove-non-ascii-characters-from-pandas-column
# all_tweets['text'] = all_tweets['text'].str.lower().str.encode('ascii', 'ignore'
#                                                               ).str.decode('ascii')

# # Remove all links and multiple hashes for one hashtagged word
# all_tweets = all_tweets.replace({'text': {r"http\S+": "", '#{1,}': ""}}, regex=True)

# # Add the tokenized words column
# all_tweets['words'] = all_tweets['text'].apply(token_lem_tweets)

# # drop columns and explode the tokens to their own rows
# all_tweets.drop(columns=['text', 'timestamp'], inplace=True)
# all_tweets = all_tweets.explode('words')

# # groupby stage
# all_tweets = all_tweets.groupby(['date', 'words']).count()
# all_tweets.reset_index(inplace=True)

# # reduce stage
# all_tweets = all_tweets[all_tweets['id'] > 10]

In [9]:
# print('length: ', len(all_tweets), 'column names: ', all_tweets.colummns)
# all_tweets.sort_values(by='id', ascending=False)
# all_tweets.info()