In [1]:
import numpy as np
from sklearn import svm
import nltk
import requests
import operator
import random
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest


nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tianbai\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# load IMDB data
path_train_pos = "../datasets_coursework1/IMDb/train/imdb_train_pos.txt"
path_train_neg = "../datasets_coursework1/IMDb/train/imdb_train_neg.txt"
path_test_pos = "../datasets_coursework1/IMDb/test/imdb_test_pos.txt"
path_test_neg = "../datasets_coursework1/IMDb/test/imdb_test_neg.txt"
path_dev_pos = "../datasets_coursework1/IMDb/dev/imdb_dev_pos.txt"
path_dev_neg = "../datasets_coursework1/IMDb/dev/imdb_dev_neg.txt"

dataset_train_pos =open(path_train_pos,'rb').readlines()
dataset_train_neg =open(path_train_neg,'rb').readlines()
dataset_test_pos =open(path_test_pos,'rb').readlines()
dataset_test_neg =open(path_test_neg,'rb').readlines()
dataset_dev_pos =open(path_dev_pos,'rb').readlines()
dataset_dev_neg =open(path_dev_neg,'rb').readlines()

In [3]:
print ("Number of positive training data: "+str(len(dataset_train_pos))+"\n")
print ("Number of negative training data: "+str(len(dataset_train_neg))+"\n")
print ("Number of positive testing data: "+str(len(dataset_test_pos))+"\n")
print ("Number of negative testing data: "+str(len(dataset_test_neg))+"\n")
print ("Number of positive develop data: "+str(len(dataset_dev_pos))+"\n")
print ("Number of negative develop data: "+str(len(dataset_dev_neg))+"\n")

for i in dataset_train_pos[:5]:
    print(i)

Number of positive training data: 7483

Number of negative training data: 7517

Number of positive testing data: 2499

Number of negative testing data: 2501

Number of positive develop data: 2518

Number of negative develop data: 2482

b'For fans of Chris Farley, this is probably his best film. David Spade plays the perfect cynical, sarcastic yin to Farley\'s "Baby Huey" yang. Farley achieves strokes of comic genius in his monologues, like the "Let\'s say you\'re driving along the road with your family..." bit, the "Jo-Jo the Idiot Circus Boy with a pretty new pet, (his possible sale)" speech, or the "Glue-sniffing Guarantee fairy" brake pad sale. The sappy moments in the film contrast sharply with Farley and Spade\'s shenanigans. Even after many viewings, it\'s still fun to see Farley pour everything he had into the role. "Richard, what\'s HAPPENING to me?!?!"\n'
b"Fantastic, Madonna at her finest, the film is funny and her acting is brilliant. It may have been made in the 80's but it

In [4]:
train_set=[]
test_set=[]

# combine positive and negitive review together as train set and test set
# and decode byte data to string
for pos_review in dataset_train_pos:
  pos_review_str = pos_review.decode();
  train_set.append((pos_review_str,1))
for neg_review in dataset_train_neg:
  neg_review_str = neg_review.decode(); 
  train_set.append((neg_review_str,0))
random.shuffle(train_set)

for pos_review in dataset_test_pos:
  pos_review_str = pos_review.decode();
  test_set.append((pos_review_str,1))
for neg_review in dataset_test_neg:
  neg_review_str = neg_review.decode();
  test_set.append((neg_review_str,0))
random.shuffle(test_set)

print ("Size of training set: "+str(len(train_set)))
print ("Size of test set: "+str(len(test_set)))




Size of training set: 15000
Size of test set: 5000


In [6]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def get_list_tokens(string):
  sentence_split=nltk.tokenize.sent_tokenize(string)
  list_tokens=[]
  for sentence in sentence_split:
    list_tokens_sentence=nltk.tokenize.word_tokenize(sentence)
    for token in list_tokens_sentence:
      list_tokens.append(lemmatizer.lemmatize(token).lower())
  return list_tokens


# First, we get the stopwords list from nltk
stopwords=set(nltk.corpus.stopwords.words('english'))
# We can add more words to the stopword list, like punctuation marks
stopwords.add(".")
stopwords.add(",")
stopwords.add("--")
stopwords.add("``")
stopwords.add("#")
stopwords.add("@")
stopwords.add(":")
stopwords.add("1")
stopwords.add("0")
stopwords.add("/")
stopwords.add(">")
stopwords.add("<")
stopwords.add("(")
stopwords.add(")")


# Now we create a frequency dictionary with all words in the dataset
# This can take a few minutes depending on your computer, since we are processing more than ten thousand sentences

# Function taken from Session 1
def get_list_tokens(string): # Function to retrieve the list of tokens from a string
  sentence_split=nltk.tokenize.sent_tokenize(string)
  list_tokens=[]
  for sentence in sentence_split:
    list_tokens_sentence=nltk.tokenize.word_tokenize(sentence)
    for token in list_tokens_sentence:
      list_tokens.append(lemmatizer.lemmatize(token).lower())
  return list_tokens


def get_list_ADJ_VERB_tokens(string): # Function to retrieve the list of tokens from a string
  sentence_split=nltk.tokenize.sent_tokenize(string)
  list_tokens=[]
  for sentence in sentence_split:
    list_tokens_sentence=nltk.tokenize.word_tokenize(sentence)
    list_tokens_sentence_tag =nltk.pos_tag(list_tokens_sentence);
    #print(list_tokens_sentence_tag)
    for token,tag in list_tokens_sentence_tag:
      if tag =='ADJ' or tag =='VB' or tag =='VBP' or tag =='VBG' :
          list_tokens.append(lemmatizer.lemmatize(token).lower())
  return list_tokens

# Function taken from Session 2
def get_vector_text(list_vocab,string):
  vector_text=np.zeros(len(list_vocab))
  list_tokens_string=get_list_tokens(string)
  for i, word in enumerate(list_vocab):
    if word in list_tokens_string:
      vector_text[i]=list_tokens_string.count(word)
  return vector_text


# Functions slightly modified from Session 2

def get_vocabulary(training_set, num_features, type_features): # Function to retrieve vocabulary
  dict_word_frequency={}
  for instance in training_set:
    sentence_tokens = []
    if type_features == "totalWordsFrequency":
        sentence_tokens=get_list_tokens(instance[0])#for feature type 1
    if type_features == "Adj_VerbWordsFrequency":
        sentence_tokens=get_list_ADJ_VERB_tokens(instance[0])#for feature type 2
    else:
        sentence_tokens=get_list_tokens(instance[0])#default
    for word in sentence_tokens:
      if word in stopwords: continue
      if word not in dict_word_frequency: dict_word_frequency[word]=1
      else: dict_word_frequency[word]+=1
  sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:num_features]
  vocabulary=[]
  for word,frequency in sorted_list:
    vocabulary.append(word)
  i=0  
  for word,frequency in sorted_list[:10]:
      i+=1
      print (str(i)+". "+word+" - "+str(frequency))
  return vocabulary



def train_svm_classifier(training_set, vocabulary): # Function for training our svm classifier
  X_train=[]
  Y_train=[]
  for instance in training_set:
    vector_instance=get_vector_text(vocabulary,instance[0])
    X_train.append(vector_instance)
    Y_train.append(instance[1])
    
  X_train = np.asarray(X_train)
  Y_train = np.asarray(Y_train) 
    
    
  fs_sentanalysis=SelectKBest(chi2, k=1000).fit(X_train, Y_train)
  X_train_new = fs_sentanalysis.transform(X_train)   

  #X_train_new = SelectKBest(chi2, k=1000).fit_transform(X_train, Y_train)
  print ("Size original training matrix: "+str(X_train.shape))
  print ("Size new training matrix: "+str(X_train_new.shape))  
    
  # Finally, we train the SVM classifier 
  svm_clf=svm.SVC(kernel="linear",gamma='auto')
  #svm_clf.fit(np.asarray(X_train),np.asarray(Y_train))
  svm_clf.fit(X_train_new,Y_train)
  return svm_clf

In [7]:
# add features of the total frequency of words to vocabulary
vocabulary=get_vocabulary(train_set, 1000,"totalWordsFrequency")  # We use the get_vocabulary function to retrieve the vocabulary
print("total num of features:"+str(len(vocabulary)))

1. br - 59548
2. 's - 36104
3. movie - 29648
4. wa - 29577
5. film - 26929
6. '' - 19857
7. n't - 19640
8. one - 15987
9. ! - 14847
10. like - 11876
total num of features:1000


In [8]:
# add features of the frequency of ADJ and VERB words to vocabulary,and this can take a while...
vocabulary.extend(get_vocabulary(train_set, 1000,"Adj_VerbWordsFrequency"))  
print("total num of features:"+str(len(vocabulary)))

1. see - 6605
2. get - 5218
3. make - 4502
4. think - 4141
5. know - 3735
6. watch - 3619
7. say - 3153
8. 've - 2974
9. 'm - 2795
10. watching - 2602
total num of features:2000


In [9]:
#Add features of key words to vocabulary, by using CountVectorizer 

X_train=[]

for instance in train_set:
    X_train.append(instance[0])
  
vectorizer = CountVectorizer(max_features = 1000)
X = vectorizer.fit_transform(X_train)
word = vectorizer.get_feature_names()

print ("Number of feature: "+str(len(word))+"\n")
#print( '\nvocabulary dic :\n\n',vectorizer.vocabulary_)

vocabulary.extend(word)  
print("total num of features:"+str(len(vocabulary)))

Number of feature: 1000

total num of features:3000


In [None]:
# reduce dimension from 3000 to 1000 and train SVM classifier. This can take for a long time...
svm_clf=train_svm_classifier(train_set, vocabulary) 

Size original training matrix: (15000, 3000)
Size new training matrix: (15000, 1000)


In [113]:
print (svm_clf.predict(fs_sentanalysis.transform([get_vector_text(vocabulary," This was a complete waste of celluloid. The preview was promising but after watching the mov")])))

ValueError: X.shape[1] = 3000 should be equal to 1000, the number of features at training time

In [114]:
X_test=[]
Y_test=[]
for instance in test_set:
  vector_instance=get_vector_text(vocabulary,instance[0])
  X_test.append(vector_instance)
  Y_test.append(instance[1])
X_test=np.asarray(X_test)
Y_test_gold=np.asarray(Y_test)
Y_text_predictions=svm_clf.predict(fs_sentanalysis.transform(X_test))
print(classification_report(Y_test_gold, Y_text_predictions))

ValueError: X.shape[1] = 3000 should be equal to 1000, the number of features at training time

In [100]:
dev_set=[]
for pos_review in dataset_dev_pos:
  pos_review_str = pos_review.decode();
  dev_set.append((pos_review_str,1))
for neg_review in dataset_test_neg:
  neg_review_str = neg_review.decode();
  dev_set.append((neg_review_str,0))
random.shuffle(dev_set)


X_dev=[]
Y_dev=[]
for instance in dev_set:
  vector_instance=get_vector_text(vocabulary,instance[0])
  X_dev.append(vector_instance)
  Y_dev.append(instance[1])
X_test=np.asarray(X_dev)
Y_test_gold=np.asarray(Y_dev)
Y_text_predictions=svm_clf.predict(X_test)
print(classification_report(Y_test_gold, Y_text_predictions))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      2501
           1       0.85      0.86      0.85      2518

   micro avg       0.85      0.85      0.85      5019
   macro avg       0.85      0.85      0.85      5019
weighted avg       0.85      0.85      0.85      5019

