In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.




In [2]:
import numpy as np
import pandas as pd

In [3]:
# The file holds one sentence per line. It is read line by line instead of with
# pd.read_csv(delimiter='\n') because current pandas versions reject a newline
# separator with a ValueError. Blank lines are skipped, which is what read_csv
# did by default (skip_blank_lines=True).
with open('project2/project2_training_data.txt', encoding='utf-8') as corpus_file:
    train_corpus = pd.DataFrame(
        {'sentences': [line.rstrip('\n') for line in corpus_file if line.strip()]}
    )

In [4]:
# Preview the loaded sentences.
train_corpus

Unnamed: 0,sentences
0,Merrill Lynch analyst Campbell Morgan upgraded...
1,Eriikka S+Âderstr+Âm has previously held sever...
2,The webcast may be followed online on the comp...
3,"Typical end-uses include roof structures , flo..."
4,The sale will be finalized in September or Oct...
...,...
1806,With this appointment Kaupthing Bank aims to f...
1807,Jon Risfelt has previously held operational ex...
1808,The group intends to relocate warehouse and of...
1809,"The contract includes software licences , appl..."


In [5]:
# The file holds one label per line. It is read line by line instead of with
# pd.read_csv(delimiter='\n') because current pandas versions reject a newline
# separator with a ValueError. Blank lines are skipped, which is what read_csv
# did by default (skip_blank_lines=True).
with open('project2/project2_training_data_labels.txt', encoding='utf-8') as labels_file:
    train_labels = pd.DataFrame(
        {'labels': [line.strip() for line in labels_file if line.strip()]}
    )

In [6]:
# Preview the loaded labels.
train_labels

Unnamed: 0,labels
0,positive
1,neutral
2,neutral
3,neutral
4,neutral
...,...
1806,positive
1807,neutral
1808,neutral
1809,neutral


In [9]:
# Number of training examples per label value.
train_labels['labels'].value_counts()

neutral     1113
positive     456
negative     242
Name: labels, dtype: int64

In [10]:
# concat(axis=1) pairs rows by index, so a length mismatch between the two
# frames would silently produce NaN-padded rows instead of an error.
assert len(train_corpus) == len(train_labels), "sentence/label row counts differ"
df = pd.concat([train_corpus, train_labels], axis=1)

In [11]:
# Raise the column display width so long sentences are not cut off, then
# show the rows whose label is 'negative'.
pd.set_option('max_colwidth', 600)
df.loc[df['labels'] == 'negative']

Unnamed: 0,sentences,labels
19,"However , the growth margin slowed down due to the financial crisis .",negative
23,"ADPnews - Feb 5 , 2010 - Finnish real estate investor Sponda Oyj HEL : SDA1V said today that it slipped to a net loss of EUR 81.5 million USD 11.8 m in 2009 from a profit of EUR 29.3 million in 2008 .",negative
26,"Alma Media 's operating profit amounted to EUR 11.9 mn , down from EUR 15.0 mn a year earlier .",negative
29,"Operating result showed a loss of EUR 2.9 mn , while a year before , it showed a profit of EUR 0.6 mn .",negative
44,The fair value of the company 's investment properties went down to EUR 2.768 billion at the end of 2009 from EUR 2.916 billion a year earlier .,negative
...,...,...
1756,Consolidated operating profit from continuing operations decreased by 62.3 % to EUR 51.2 mn from EUR 135.7 mn in 2007 .,negative
1764,"Salcomp Oyj , the Finnish maker of mobile phone chargers , Monday posted a EUR1 .49 million loss in the second quarter compared with a 1.70 million profit in the same period the previous year .",negative
1767,Net sales dropped by 6 % year-on-year to EUR 11.9 million .,negative
1773,Finnish business software group AffectoGenimap Oyj said its net profit halved to 1.2 mln euro ( $ 1.5 mln ) in the first nine months of 2006 from 2.2 mln euro ( $ 2.8 mln ) in the same period of 2005 .,negative


### Pre-processing

In [16]:
from nltk.tokenize import word_tokenize, sent_tokenize, wordpunct_tokenize, TreebankWordTokenizer, TweetTokenizer, MWETokenizer
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string

In [19]:
# Import the NLTK corpus under a private alias. The result below is stored in a
# variable named `stopwords`; without the alias that assignment replaces the
# imported corpus object and this cell fails when it is run a second time.
from nltk.corpus import stopwords as _nltk_stopwords

# Words taken OUT of NLTK's English stop-word list, i.e. kept in the cleaned text.
# NOTE(review): "aren't" is listed but 'aren' is not, unlike the other
# contraction pairs below - confirm whether that is intentional.
_KEPT_WORDS = {
    # negation contractions
    "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'don', "don't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
    # plain negations
    'no', 'nor', 'not',
    # direction / comparison words
    'above', 'below', 'before', 'down', 'more', 'further', 'from', 'over',
    'to', 'under', 'up',
}

# Still a list, as before; sorted so its order is the same on every run
# (iteration order of a set of strings is not stable between interpreter runs).
stopwords = sorted(set(_nltk_stopwords.words("english")) - _KEPT_WORDS)

In [29]:
def clean_data(text, stopwords, tokenizer=None):
    """Normalise a sentence into a space-separated string of cleaned tokens.

    The text is tokenised and lower-cased, stop words are removed, leading and
    trailing punctuation is stripped from every remaining token, and tokens
    that are empty after stripping are dropped. Dropping them covers both
    single punctuation characters and multi-character punctuation tokens such
    as ``--`` or quote marks, so the result never contains doubled, leading or
    trailing spaces.

    Parameters
    ----------
    text : str
        The raw sentence.
    stopwords : collection of str
        Lower-case words to remove; only membership (``in``) is used. The
        check is made on the lower-cased token before punctuation is stripped.
    tokenizer : callable, optional
        Function mapping a string to a list of token strings. When omitted,
        NLTK's ``word_tokenize`` is used.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing remains).
    """
    tokenize = word_tokenize if tokenizer is None else tokenizer

    lowered = (token.lower() for token in tokenize(text.strip()))
    # string.punctuation already contains '+', '-', '*' and '/'.
    stripped = (
        token.strip(string.punctuation)
        for token in lowered
        if token not in stopwords
    )
    # Tokens made up only of punctuation are empty at this point; skip them.
    return ' '.join(token for token in stripped if token)

In [30]:
# Store the cleaned form of every sentence in a new 'cleaned' column.
df['cleaned'] = df['sentences'].apply(clean_data, args=(stopwords,))

In [32]:
# Compare raw and cleaned sentences for the first ten rows.
df.head(10)

Unnamed: 0,sentences,labels,cleaned
0,Merrill Lynch analyst Campbell Morgan upgraded his recommendation on PaperlinX from `` neutral '' to `` buy '' in May .,positive,merrill lynch analyst campbell morgan upgraded recommendation paperlinx from neutral to buy may
1,Eriikka S+Âderstr+Âm has previously held several positions in finance and control at Nokia Networks including acting as the Business Group Controller and having the corporate controller position at Nokia Siemens Networks .,neutral,eriikka s+âderstr+âm previously held several positions finance control nokia networks including acting business group controller corporate controller position nokia siemens networks
2,The webcast may be followed online on the company website at www.ruukki.com/investors .,neutral,webcast may followed online company website www.ruukki.com/investors
3,"Typical end-uses include roof structures , floorings , walls and ceilings , non-visible structures in vehicles , packaging and boxes , construction site structures , fencing and shelters , and formwork with a limited number of concrete pourings .",neutral,typical end-uses include roof structures floorings walls ceilings non-visible structures vehicles packaging boxes construction site structures fencing shelters formwork limited number concrete pourings
4,"The sale will be finalized in September or October , the company said .",neutral,sale finalized september october company said
5,"Finnish steel maker Rautaruukki Oyj ( Ruukki ) said on July 7 , 2008 that it won a 9.0 mln euro ( $ 14.1 mln ) contract to supply and install steel superstructures for Partihallsforbindelsen bridge project in Gothenburg , western Sweden .",positive,finnish steel maker rautaruukki oyj ruukki said july 7 2008 won 9.0 mln euro 14.1 mln contract to supply install steel superstructures partihallsforbindelsen bridge project gothenburg western sweden
6,"The works will include the laying of natural stone pavements and the installation of underground heating , and surface water drainage systems .",neutral,works include laying natural stone pavements installation underground heating surface water drainage systems
7,The order was valued at over EUR15m .,neutral,order valued over eur15m
8,"Finnish metal products company Componenta Oyj ( HEL : CTH1V ) said today its net loss narrowed to EUR 500,000 ( USD 680,000 ) in the last quarter of 2010 from EUR 5.3 million for the same period a year earlier .",positive,"finnish metal products company componenta oyj hel cth1v said today net loss narrowed to eur 500,000 usd 680,000 last quarter 2010 from eur 5.3 million period year earlier"
9,Coffee will be served starting at 14:30 EET as well as after the event .,neutral,coffee served starting 14:30 eet well event


In [22]:
# Scratch example: tokenise a simple sentence by splitting on whitespace.
text='This is an elective course'

text.split()

# Alternative sample containing punctuation and an emoji (disabled):
# text="Let me know your mobile#, if that's fine. The call rate is 1.2p/s 😊️."

# print(word_tokenize(text))


['This', 'is', 'an', 'elective', 'course']

In [23]:
# Same sentence tokenised with NLTK's word_tokenize, for comparison with str.split.
word_tokenize(text)

['This', 'is', 'an', 'elective', 'course']

In [21]:
import nltk
# Aliased so this import does not rebind `stopwords`, the name that holds the
# customised stop-word list passed to clean_data elsewhere in this notebook.
from nltk.corpus import stopwords as nltk_stopwords

# Full NLTK English stop-word list, printed for reference.
stops = set(nltk_stopwords.words('english'))
print(stops)

{'themselves', 'my', "should've", "needn't", 'and', 'was', 'had', 'at', 'few', 'should', 'am', 'ma', 'too', 're', 'to', 'each', "wasn't", "you'd", 'myself', "isn't", 'for', 'your', 'as', 'until', 'whom', 'their', 've', 'under', 'wasn', 'while', 'shouldn', 'during', 'mightn', 'not', 'most', 'very', 'shan', "mightn't", 'doesn', 'about', 'he', "you've", 'then', 'other', 'nor', 'is', 'where', 'be', 'can', 'doing', 'how', 'being', "doesn't", 'with', 'aren', 'yourself', 'were', 'up', 'have', 'below', "wouldn't", 'a', "don't", 'who', 'between', 'own', 'ain', 'just', 'will', 'didn', 'an', 'hasn', 'won', 'or', 'his', 'by', 'any', 'before', 'hadn', 'having', 'you', 'll', "hadn't", 'only', 'it', 'these', 'o', 'did', 'theirs', 'into', 'yourselves', 'her', 'when', 'all', 'once', "aren't", 'himself', 'on', 'in', 'than', 'because', 'she', 'yours', 'so', 'they', 'no', 'don', 'through', 'we', "it's", 'both', 'here', 't', "didn't", "mustn't", 'after', 'if', 'now', "hasn't", 'those', "shan't", 'this', 'i

In [24]:
# Replace the set with an alphabetically ordered list (sorted accepts any iterable).
stops = sorted(stops)

In [25]:
# Show the alphabetically sorted stop-word list.
stops

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

"aren't",  'above', 'couldn', "couldn't", 'didn', "didn't",'doesn',"doesn't", 'don',"don't", 'below', 'before', 'down','hadn',"hadn't",'hasn',"hasn't", 'haven',"haven't","isn't", 'isn', 'mightn',"mightn't", 'mustn', "mustn't", 'needn', "needn't", 'more', 'further','from','no','nor','not', 'over', 'shan',"shan't", 'shouldn',"shouldn't", 'to','under', 'up','wasn',"wasn't",'weren',"weren't", 'won',"won't",'wouldn', "wouldn't",



