# Prepare the dataset

Upload files to Google Colaboratory

Please download the `imdb_labeled.txt` file to your computer, then use the button below to upload it to Colab.


In [None]:
# Open Colab's upload widget; choose imdb_labeled.txt from your local machine.
from google.colab import files
uploaded = files.upload()

# NOTE: the cells below read "imdb_labeled.txt" from the current working directory.
# If a file with that name is already there, the new upload is saved under a
# different name (the recorded run below saved it as "imdb_labeled (1).txt"),
# so the reading cells presumably still pick up the earlier copy.

Saving imdb_labeled.txt to imdb_labeled (1).txt


The dataset is a TSV (tab-separated values) file in which each line has the format instance **\t** label.

- Instance: a free-text movie comment
- Label: 0 or 1, where 0 stands for negative and 1 stands for positive

In [None]:
# Read the records from the file into two parallel lists:
# instances[i] holds the review text and labels[i] holds its integer label.

instances = []
labels = []

# Read all the lines inside a `with` block so the file handle is closed as soon
# as reading finishes. The encoding is stated explicitly so decoding does not
# depend on the platform default.
with open("imdb_labeled.txt", "r", encoding="utf-8") as tsv_file:
  lines = tsv_file.readlines()

# skip the first line since it is the header
for line in lines[1:]: #start from index 1 instead of 0
  # remove surrounding whitespace, including the newline at the end of the line
  line = line.strip()
  # ignore blank lines (e.g. a trailing empty line) instead of failing on them
  if not line:
    continue
  # split by tab: elements[0] is the instance text, elements[1] is the label
  elements = line.split('\t')
  instance = elements[0]
  # convert the label to an integer
  label = int(elements[1])
  # add them to the lists
  instances.append(instance)
  labels.append(label)

print ('Load', len(instances), 'instances with', len(labels), 'labels')

Load 3000 instances with 3000 labels


# Exercise 1

In [None]:
# Alternative loader: the same file can be read with pandas.
# Please complete the following code blocks.

import pandas as pd

# show up to 100 characters of each text cell when a DataFrame is displayed
pd.options.display.max_colwidth = 100

# Use pd.read_csv() to read the dataset and save it to a variable called df.
# The documentation lists the parameters that are available:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

# the file is tab-separated, so the delimiter has to be given explicitly
df = pd.read_csv('imdb_labeled.txt', delimiter='\t') #finish your codes here

In [None]:
# preview the first ten rows of df
df.head(n=10)

Unnamed: 0,Instance,Label
0,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",0
1,"Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.",0
2,"Attempting artiness with black & white and clever camera angles, the movie disappointed - became...",0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo is trying to find a song that keeps running through...,1
5,"The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess bec...",0
6,Wasted two hours.,0
7,"Saw the movie today and thought it was a good effort, good messages for kids.",1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the science teacher.,1


In [None]:
# pandas has many built-in functions for exploring the dataset. For example:
# how many instances have label 1?

# boolean mask that is True for the rows whose Label equals 1
is_positive = df['Label'] == 1
print(len(df[is_positive]))

1500


In [None]:
# another example: fetch the first row (position 0) and print it

first_row = df.iloc[0]
print(first_row)

Instance    A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  
Label                                                                                             0
Name: 0, dtype: object


You can explore other pandas functions later.

# One-hot encoder

Note that we would normally do text preprocessing at this step, before generating text representations. Since you have just learned and practiced text preprocessing methods, let's skip it for now so we have more time for the new topic.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records only whether a word occurs in a review (1) or not (0).
# max_features=50 limits the vocabulary to the 50 most frequent words; it is set
# here for demonstration only, and in practice you do not need to set it.
# Note: no stop_words argument is passed, so stopwords are NOT removed here.
vectorizer = CountVectorizer(max_features=50, binary=True)

# the 'Instance' column of df can be passed to the vectorizer directly;
# this call generates the matrix
review_texts = df['Instance']
onehot = vectorizer.fit_transform(review_texts)

# print the shape of the matrix
print(onehot.shape)


(3000, 50)


In [None]:
# Wrap the matrix in a pandas DataFrame so it is easier to look at.

# None lifts the limit on how many columns are displayed
pd.options.display.max_columns = None

# one column per vocabulary word, one row per review
onehot_df = pd.DataFrame(
  data=onehot.toarray(),
  columns=vectorizer.get_feature_names_out(),
)

onehot_df.head(n=20)

Unnamed: 0,all,an,and,are,as,at,bad,be,but,film,food,for,from,good,great,had,has,have,if,in,is,it,just,like,movie,my,not,of,on,one,out,phone,place,really,service,so,that,the,there,they,this,time,to,very,was,we,well,were,with,you
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


# TF encoder

# Exercise 2

In [None]:
### Exercise: create a TF (term-frequency) encoder rather than a one-hot one.
### For demonstration, also set max_features to 50 for consistency.
# binary=False keeps the count of each word instead of a 0/1 flag
vectorizer = CountVectorizer(max_features=50, binary=False) #start your codes here



In [None]:
# Generate the matrix again, this time with the TF encoder that was just created,
# and wrap it in a DataFrame for display.

tf = vectorizer.fit_transform(df['Instance'])
tf_df = pd.DataFrame(
  data=tf.toarray(),
  columns=vectorizer.get_feature_names_out(),
)

tf_df.head(n=20)

Unnamed: 0,all,an,and,are,as,at,bad,be,but,film,food,for,from,good,great,had,have,if,in,is,it,just,like,movie,my,not,of,on,one,out,phone,place,really,service,so,that,the,there,they,this,time,to,very,was,we,well,were,with,would,you
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0


# TF-IDF encoder

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF encoder, limited to 50 features like the encoders above
vectorizer = TfidfVectorizer(max_features=50)

# build the TF-IDF matrix from the 'Instance' column
tfidf = vectorizer.fit_transform(df['Instance'])

# wrap the matrix in a DataFrame with one column per vocabulary word
tfidf_df = pd.DataFrame(
  data=tfidf.toarray(),
  columns=vectorizer.get_feature_names_out(),
)

tfidf_df.head(n=10)

Unnamed: 0,all,an,and,are,as,at,bad,be,but,film,food,for,from,good,great,had,have,if,in,is,it,just,like,movie,my,not,of,on,one,out,phone,place,really,service,so,that,the,there,they,this,time,to,very,was,we,well,were,with,would,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.337933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.94117,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.431264,0.35116,0.0,0.0,0.583691,0.0,0.0,0.0,0.0,0.0,0.0,0.468833,0.0,0.0,0.0,0.0,0.0,0.0,0.360831,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.586008,0.0,0.36529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.348687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.491354,0.0,0.0,0.0,0.0,0.0,0.0,0.25211,0.0,0.0,0.0,0.310785,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.519602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.509949,0.685541,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369284,0.301522,0.0,0.0,0.0,0.461247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.399412,0.433312,0.0,0.0,0.0,0.0,0.318525,0.0,0.333493,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.415511,0.0,0.0,0.71306,0.0,0.0,0.366227,0.0,0.0,0.257694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.344047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.216248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.326622,0.0,0.723883,0.0,0.0,0.0,0.0,0.0,0.0,0.250531,0.0,0.0,0.386017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181319,0.0,0.0,0.0,0.0,0.0,0.0,0.2791,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.665887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.597126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
