In [1]:
import pandas as pd
import nltk
from nltk.corpus import movie_reviews

# NLTK movie reviews dataset

### Collect all reviews in a list of (text, category) tuples

In [2]:
# One (raw review text, category label) pair per file in the corpus,
# iterating category by category.
documents = [
    (movie_reviews.raw(fileid), label)
    for label in movie_reviews.categories()
    for fileid in movie_reviews.fileids(label)
]

### Convert to a DataFrame

In [3]:
# Build the frame in a single constructor call instead of appending one-row
# frames inside a loop: `DataFrame.append` was removed in pandas 2.0, and
# growing a frame row by row re-copies all accumulated rows on every iteration.
# Each element of `documents` is a (text, category) tuple; the columns are then
# arranged as (category, text).
reviewdf = pd.DataFrame(documents, columns=['text', 'category'])[['category', 'text']]

In [4]:
# Swap the existing index for a fresh 0..n-1 range (the old labels are
# discarded, not kept as a column), then preview the first rows.
reviewdf = reviewdf.reset_index(drop=True)
reviewdf.head()

Unnamed: 0,category,text
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


### Encode category as 0 (negative) / 1 (positive)

In [5]:
# Encode the label numerically: 'neg' becomes 0, every other value becomes 1.
# Gotcha: this cell is not idempotent. Once the column holds 0/1, running it
# again turns every label into 1, because 0 is not equal to 'neg'.
reviewdf['category'] = (reviewdf['category'] != 'neg').astype('int64')

### Save as CSV
Note: this dataset may be too small on its own (see the row count below).

In [6]:
# Number of rows in the review frame.
len(reviewdf.index)

2000

In [7]:
# Write the reviews as a tab-separated file with a (category, text) header
# row and no index column; an existing file is overwritten.
reviewdf.to_csv(
    path_or_buf='/home/jfreek/workspace/Mining_The_Social_Web/datasets/moviereviews.csv',
    sep='\t',
    columns=['category', 'text'],
    header=['category', 'text'],
    index=None,
    mode='w',
)

In [8]:
# Load the saved file back: row 0 is treated as the header and its labels
# are replaced by the explicit column names.
nltk_df = pd.read_csv(
    '/home/jfreek/workspace/Mining_The_Social_Web/datasets/moviereviews.csv',
    names=['category', 'text'],
    header=0,
    sep='\t',
)
nltk_df.head()

Unnamed: 0,category,text
0,0,"plot : two teen couples go to a church party ,..."
1,0,the happy bastard's quick movie review \ndamn ...
2,0,it is movies like these that make a jaded movi...
3,0,""" quest for camelot "" is warner bros . ' firs..."
4,0,synopsis : a mentally unstable man undergoing ...


# Tweets from Stanford

In [9]:
# Comma-separated tweet dump with no header row; column names are supplied here.
tweets_df = pd.read_csv(
    '/home/jfreek/workspace/Mining_The_Social_Web/datasets/tweetsstanford_training.csv',
    names=['category', 'id', 'date', 'query', 'user', 'text'],
    header=None,
    sep=',',
)

In [10]:
# Encode the label as binary: a raw value of 4 becomes 1, anything else becomes 0.
# Gotcha: this cell is not idempotent. After the first run no 4s remain, so
# running it again turns every label into 0.
tweets_df['category'] = (tweets_df['category'] == 4).astype('int64')
tweets_df.loc[:, ['category', 'text']].head()

Unnamed: 0,category,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


# Umich Data

In [11]:
# Tab-separated file with no header row; the label column comes first, then the text.
umich_df = pd.read_csv(
    '/home/jfreek/workspace/Mining_The_Social_Web/datasets/umich_training.txt',
    names=['category', 'text'],
    header=None,
    sep="\t",
)

In [12]:
# Preview the first five rows.
umich_df.head(n=5)

Unnamed: 0,category,text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


# Amazon

In [13]:
# Tab-separated file with no header row; the text column comes first, then the label.
amazon_df = pd.read_csv(
    '/home/jfreek/workspace/Mining_The_Social_Web/datasets/amazon_cells_labelled.txt',
    names=['text', 'category'],
    header=None,
    sep="\t",
)
amazon_df.head()

Unnamed: 0,text,category
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


# IMDB

In [14]:
# Tab-separated file with no header row; the text column comes first, then the label.
# NOTE(review): the parser's default quote handling treats '"' as a quote
# character; if the raw text contains unbalanced double quotes, several lines
# can be merged into one row. Verify the row count against the file's line
# count, and consider quoting=csv.QUOTE_NONE if they differ.
imdb_df = pd.read_csv(
    '/home/jfreek/workspace/Mining_The_Social_Web/datasets/imdb_labelled.txt',
    names=['text', 'category'],
    header=None,
    sep="\t",
)
imdb_df.head()

Unnamed: 0,text,category
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


# Yelp

In [15]:
# Tab-separated file with no header row; the text column comes first, then the label.
yelp_df = pd.read_csv(
    '/home/jfreek/workspace/Mining_The_Social_Web/datasets/yelp_labelled.txt',
    names=['text', 'category'],
    header=None,
    sep="\t",
)
yelp_df.head()

Unnamed: 0,text,category
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# All datasets combined

In [16]:
# Stack every source into one training frame. Columns are aligned by name, so
# frames loaded as (text, category) line up with those loaded as
# (category, text). Only the two shared columns of the tweets frame are used.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source keeps its own row labels, so labels repeat across sources and
# label-based lookups such as .loc[0] would return several rows.
trainset_df = pd.concat(
    [
        nltk_df,
        tweets_df[['category', 'text']],
        umich_df,
        yelp_df,
        imdb_df,
        amazon_df,
    ],
    ignore_index=True,
)

In [17]:
# Preview the first ten rows of the combined frame.
trainset_df.head(n=10)

Unnamed: 0,category,text
0,0,"plot : two teen couples go to a church party ,..."
1,0,the happy bastard's quick movie review \ndamn ...
2,0,it is movies like these that make a jaded movi...
3,0,""" quest for camelot "" is warner bros . ' firs..."
4,0,synopsis : a mentally unstable man undergoing ...
5,0,capsule : in 2176 on the planet mars police ta...
6,0,"so ask yourself what "" 8mm "" ( "" eight millime..."
7,0,that's exactly how long the movie felt to me ....
8,0,call it a road trip for the walking wounded . ...
9,0,plot : a young french boy sees his parents kil...


In [18]:
# Write the combined set as a tab-separated file with a (category, text)
# header row and no index column; an existing file is overwritten.
trainset_df.to_csv(
    path_or_buf='/home/jfreek/workspace/Mining_The_Social_Web/datasets/alltrainset.csv',
    sep='\t',
    columns=['category', 'text'],
    header=['category', 'text'],
    index=None,
    mode='w',
)