## Example Code for Loading and Manipulating Data
- Loading multiple files (Movie Reviews)
- Netvizz
- Political Mashup

In [None]:
# for more detail have a look at Notebook 5.2 Reading and Writing Files
import os # import the operating system library
import codecs
import nltk

### Loading a collection of files

In [None]:
# To run this example, first download and unzip this archive:
# http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
path_to_neg_reviews = 'review_polarity/txt_sentoken/neg/' # relative path: assumes the unzipped folder sits in the same directory as the notebook

In [None]:
# List all the files in the folder.
# os.listdir returns names in arbitrary order, so we sort them: this keeps the
# file order (and therefore the order of the tokens collected later) identical
# on every run and every machine.
reviews = sorted(os.listdir(path_to_neg_reviews))
print(reviews[:10]) # preview the first ten file names

In [None]:
from nltk.tokenize import regexp_tokenize

neg_reviews_tokens_all = [] # here we will store the tokens of all reviews, in one flat list
for review in reviews:
    review_path = os.path.join(path_to_neg_reviews, review) # folder + file name
    # the with-statement closes the file as soon as it has been read,
    # so we do not keep one open file handle per review
    with open(review_path, 'r') as review_file:
        # if this fails try codecs.open(review_path,'r',encoding='utf-8') instead of open(...)
        review_text = review_file.read()
    # raw string (r'...') so the backslash in \w is passed to the regex engine unchanged
    tokens = regexp_tokenize(review_text, pattern=r'\w+') # tokenize the text
    neg_reviews_tokens_all.extend(tokens) # add tokens to the list

In [None]:
# Wrap the flat list of tokens in an NLTK Text object so that the Text
# methods become available (see Notebook 4.2 for more examples).
neg_reviews_nlkt_text = nltk.Text(neg_reviews_tokens_all)
# Show every occurrence of 'awful' together with its surrounding context.
neg_reviews_nlkt_text.concordance('awful')

### Loading Data from Netvizz

In [None]:
path_to_netvizz_files = 'page_15704546335_2018_01_24_10_02_35/' # relative path to the folder with the Netvizz files; change this to the path of your own files

In [None]:
# List all the files in this directory.
# os.listdir gives no ordering guarantee, so sort the names to get the same
# output on every run.
netvizz_files = sorted(os.listdir(path_to_netvizz_files))
print(netvizz_files)

In [None]:
# Name of the top-comments file inside the Netvizz folder.
topcomments_filename = 'page_15704546335_2018_01_24_10_02_35_topcomments.tab'
# Combine folder and file name into one path and show it.
topcomments_path = os.path.join(path_to_netvizz_files, topcomments_filename)
print(topcomments_path)

In [None]:
# Open the file and read its whole content; the with-statement makes sure the
# file is closed again once we are done reading.
with open(topcomments_path, 'r') as topcomments_file:
    topcomments = topcomments_file.read().strip() # .strip() deletes leading and trailing whitespace
print(topcomments[:100]) # preview the first 100 characters

In [None]:
# Cut the raw text into one string per line.
rows = topcomments.split('\n')
print(len(rows))
# Preview two rows: index 0 is the header, index 1 is a top comment.
for preview_index in (0, 1):
    print(rows[preview_index])

In [None]:
# Turn every row after the header (rows[0]) into a list of cells.
# Cells are tab separated; a tab in python is "\t".
data = [line.split('\t') for line in rows[1:]]


In [None]:
# Display the first three entries of data (each entry is the list of cells of one row).
data[:3]

In [None]:
# Collect all posts mentioning Taco Bell.
# The text is the second item (location index 1) of each row; it is lowercased
# before the check so that the match ignores capitalisation.
taco_bell = [
    entry
    for entry in data
    if 'taco bell' in entry[1].lower()
]

In [None]:
# Print every row that was collected in taco_bell.
print(taco_bell)

### Opening and loading data from Political Mashup

In [None]:
import csv # we need another module here because the separator (',') also appears in the text

# Hand the open file directly to csv.reader instead of splitting the text on '\n' ourselves:
# - newline='' lets the csv module handle line endings, so a quoted cell that
#   contains a line break stays one cell
# - a final newline at the end of the file no longer produces an empty extra row
# - the with-statement closes the file after reading
with open('hits.csv', 'r', newline='') as hits_file:
    data = list(csv.reader(hits_file, quotechar='"', delimiter=','))

In [None]:
# Preview the first two parsed rows ...
for preview_row in data[:2]:
    print(preview_row)
# ... and the last cell of the second row.
print(data[1][-1])