# Re-grouping streaming text files + Cleaning

## Import 

In [1]:
import os
import pickle
import re
import sys

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

## Path

In [2]:
# Directory holding the raw stream dumps, relative to the notebook.
stream_path = "tweets/"
# os.listdir returns entries in arbitrary order; sorting makes stream_files[0]
# designate the same file on every run and on every platform.
stream_files = sorted(os.path.join(stream_path, f) for f in os.listdir(stream_path))
print(stream_files)

['tweets/stream_2016_11_09_10-29-31.txt', 'tweets/stream_2016_11_11_10-26-28.txt']


In [3]:
# Whitelist of single characters, built group by group:
# whitespace/punctuation, lower-case ASCII letters, digits, then '/' and '@'.
ACCEPTED_CARACTERS = (
    list('\n !"\'()*,-.:;?[]_')
    + list('abcdefghijklmnopqrstuvwxyz')
    + list('0123456789')
    + list('/@')
)

## Custom functions

In [4]:
def open_and_read(f, n_header=3):
    """Read a stream text file and split it into header and tweet lines.

    :param f: path of the file to read.
    :param n_header: number of leading lines treated as the header
        (3 by default).
    :return: ``(header, tweets)`` -- two lists of raw lines, with their
        newline characters kept.
    """
    # A distinct name for the handle keeps the path argument from being
    # shadowed inside the function.
    with open(f, "r") as stream:
        lines = stream.readlines()
    return lines[:n_header], lines[n_header:]

In [5]:
def remove_emoji(data):
    """
    Remove emoji from a text string.

    Characters in the ranges U+2600-U+27BF, U+1F300-U+1F64F and
    U+1F680-U+1F6FF are deleted.

    :param data: value to clean.
    :return: ``data`` itself when it is empty or not a string, otherwise
        the string with the matching characters removed.
    """
    if not data:
        return data
    try:
        string_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        string_types = str  # Python 3: basestring no longer exists
    if not isinstance(data, string_types):
        return data
    # NOTE(review): on Python 2 a UTF-8 encoded byte string presumably passes
    # through unchanged, since the patterns below are unicode -- decode first.
    try:
        # UCS-4 (wide) build: astral code points can be used directly in ranges.
        patt = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2 (narrow) build: the ranges above are rejected, so the same
        # blocks are spelled out as surrogate pairs.
        patt = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
    return patt.sub('', data)

In [6]:
def extract_next_tweet(indice, tweets, max_lines=100):
    """Assemble the tweet record that starts at ``tweets[indice]``.

    Following lines are appended until the accumulated text contains
    ``<end>``, ``max_lines`` lines have been appended, or the list is
    exhausted.

    :param indice: index of the first line of the record.
    :param tweets: list of raw lines.
    :param max_lines: maximum number of continuation lines appended to the
        first one (100 by default).
    :return: ``(tweet, indice)`` -- the concatenated record and the index
        of the line that follows it.
    :raises IndexError: if ``indice`` is not a valid index into ``tweets``.
    """
    tweet = tweets[indice]
    indice += 1
    appended = 0
    n_lines = len(tweets)
    # The bound on indice keeps a final record with no "<end>" marker from
    # reading past the end of the list.
    while "<end>" not in tweet and appended < max_lines and indice < n_lines:
        tweet += tweets[indice]
        indice += 1
        appended += 1
    return tweet, indice

def tweet_cleaning(tweet, accepted_caracters=None, emoji_cleaning=True):
    """Strip the record markers from a raw tweet and optionally filter it.

    :param tweet: raw record text, including its ``<start>`` / ``<end>``
        markers and ``text:`` field key.
    :param accepted_caracters: collection of allowed characters. When it is
        non-empty, the tweet is rejected if its lower-cased text contains
        any character outside the collection. ``None`` or an empty
        collection disables the filter.
    :param emoji_cleaning: kept in the signature for existing callers; the
        body of this function does not read it.
    :return: the cleaned text, or ``None`` when the tweet is rejected.
    """
    # Removing the 'start', 'end' markers
    t = tweet.replace("<start>", "")
    t = t.replace("<end>\n", "")
    # A record that is not newline-terminated still carries a bare "<end>".
    if t.endswith("<end>"):
        t = t[:-len("<end>")]
    # Drop the "text:" field key only where it prefixes the record: a global
    # replace would also eat "text:" inside the tweet body ("context: ...").
    if t.startswith("text:"):
        t = t[len("text:"):]

    # Reject tweet with non-accepted characters
    if accepted_caracters and not set(t.lower()).issubset(set(accepted_caracters)):
        return None
    return t

# Test 

### Open a stream file

In [7]:
# Load the first stream file and peek at its header and first raw lines.
header, tweets = open_and_read(stream_files[0])
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("HEADER :\n%s" % (header,))
print("TWEETS :\n%s" % (tweets[0:4],))

HEADER :
['Description: No_filtering\n', "Queries: ['a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z']\n", "Keys: ['text']\n"]
TWEETS :
["<start>text:Yall being so extra about Trump. He doesn't have all the power yall think he does  !!!<end>\n", '<start>text:I give up.<end>\n', "<start>text:@chelseaaabailey oh god know, please don't \xf0\x9f\x98\x82\xf0\x9f\x98\x82\xf0\x9f\x98\x82<end>\n", "<start>text:RT @offclASTRO_PH: [161110] ASTRO 'AUTUMN STORY' COUNTDOWN #\xec\x95\x84\xec\x8a\xa4\xed\x8a\xb8\xeb\xa1\x9c #\xea\xb3\xa0\xeb\xb0\xb1 #ASTROCOMEBACK\n"]


### Extract a tweet at position x

In [8]:
x = 0  # index of the line on which the record starts
tweet, new_x = extract_next_tweet(x, tweets)
print(tweet)

<start>text:Yall being so extra about Trump. He doesn't have all the power yall think he does  !!!<end>



### Cleaning a tweet

- Removing smileys
- Removing markers (such as `<start>`, `<end>`, ...)
- Labelling

In [30]:
# Continue from where the previous extraction stopped, then strip the markers.
# Re-running this cell advances new_x, so each run shows the next record.
tweet, new_x = extract_next_tweet(new_x, tweets)
cleaned_tweet = tweet_cleaning(tweet)
print(cleaned_tweet)

RT @lhfang: Hillary team's strategy of directly &amp; implicitly (#ImWithHer) accusing *all* critics of sexism fueled bitter resentment on the…


## Extracting/Cleaning all tweets available

In [134]:
tweets = []

tweet_count = 0
# NOTE: the [0:1] slice restricts the run to the first stream file.
for stream_file in stream_files[0:1]:
    print("\n%s" % stream_file)
    # Open and read file
    header, ts = open_and_read(stream_file)
    # Extracting tweet loop
    counter = 0
    # NOTE(review): with the "- 1" bound, a record that starts on the very
    # last line is never extracted -- confirm this is intended.
    while counter < len(ts) - 1:
        # Extract
        t, counter = extract_next_tweet(counter, ts)
        # Clean
        t = tweet_cleaning(t, accepted_caracters=ACCEPTED_CARACTERS, emoji_cleaning=False)

        # Store
        if t is not None:
            tweets.append(t)
            # Tweet count
            tweet_count += 1
        # "\r" returns to the start of the line so the counter is redrawn in
        # place; write() is used because it emits no trailing newline.
        sys.stdout.write("\rTweet count = %d" % tweet_count)
    sys.stdout.flush()


tweets/stream_2016_10_03_15-56-05.txt
Tweet count = 26989


In [135]:
# Show a sample only: displaying the whole list floods the notebook output.
tweets[:15]

["you mentioning a fake account doesn't add up also. :) https://t.co/E6n9Jw9iZX",
 '@meltingreality @ariandazm first come first serve :)',
 '@ApexCris You should take me to the lighthouse. I can be your good luck charm :)',
 "@MirandaCosgrove You're so amazing!!! :D 809",
 '@UnidadesDePaium ouch :(',
 '@Codi_whoelse @Cassie_Daggs this is too much for that sweet boy to take :(',
 '@darklylacquered ..b.oth? :D',
 "@CBCArts @agotoronto You're welcome! I'll let my student know you liked it :)",
 "@jbarro @fud31 @BreitbartNews Yet their tax returns are available and they aren't even running for President :)",
 '@TheRedPika Alright, thanks dude! All the cash goes toward a new drawing tablet, if they like it please have them keep sharing! :)',
 'RT @ml_kayla: here are some random pics :) https://t.co/rZbKOaXu0J',
 '@x3Larsen nothing just checking up on u :)',
 "That feeling when you're trash and get carried to the lighthouse :) @WiLL9700 @_WillFear",
 '@hellcasecom Good luck :)',
 '@cabelloqu

In [58]:
# Pickle data is binary: the file has to be opened in binary mode ("wb").
with open("first_samples.pkl", "wb") as f:
    pickle.dump(tweets, f)

In [59]:
# Pickle data is binary: the file has to be opened in binary mode ("rb").
# Only unpickle files you produced yourself -- pickle.load can execute
# arbitrary code from a crafted file.
with open("first_samples.pkl", "rb") as f:
    tweets = pickle.load(f)