# Mehmet Kubilay Gulacdi

## Import Libraries

In [1]:
import pandas as pd
import warnings
import re
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Silence every warning for the rest of the session; note that this also hides
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore') 
# Fetch the VADER lexicon (SentimentIntensityAnalyzer is imported from nltk.sentiment.vader).
# NOTE(review): sent_tokenize/word_tokenize and the stopwords corpus need their
# own NLTK resources, which are not downloaded here -- presumably they are
# already installed on this machine; confirm before running elsewhere.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\K\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Read txt and define token identifiers
We define the words (tokens) "gold", "golden" and "silver" that will serve as our identifiers.

In [2]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes, instead of leaving it open for the
# lifetime of the session.
with open("sample.txt", "r", encoding="utf8") as file:
    documents = file.read()

# Identifier tokens: a sentence counts as a gold/silver reference when one of
# its tokens is equal to one of these strings.
gold_tokens = ['gold', 'golden']
silver_tokens = ['silver']

In [3]:
documents



## Sentence and word tokenize

- First, we split the whole document into sentences using sent_tokenize().
- Second, we apply word_tokenize() to each sentence to obtain the tokens of every sentence in the document.

In [4]:
# Sentence-level split of the raw text.
sentences = sent_tokenize(documents)
# Word-level split of each sentence, lowercased first so that the tokens can
# be compared against lowercase identifiers.
tokens = list(map(word_tokenize, map(str.lower, sentences)))

In [5]:
sentences

['@fansoniclove Gold the Tenrec\nTokyo-bound Sampson sets Aust rifle record.',
 'Shooter Dane Sampson has struck career-best form as he builds towards a third Olympics, setting a national record while winning the 50m rifle event at the South Australia championships.',
 'Sampson registered a score of 462 points to claim gold in the three positions event.',
 "The performance bettered Sampson's own national record of 460.7 points, which he achieved at last month's Wingfield grand prix.",
 "The score was also notably higher than what Italy's Niccolo Campriani (458.8) and Poland's Tomasz Bartnik (460.4) produced to win gold at the 2016 Olympics and 2018 world championships respectively.",
 '"It\'s good to be shooting PBs at this stage.',
 'It was a world-class finals score," Sampson said, having previously competed at the 2012 and 2016 Olympics.',
 '"You are unlikely to lose many competitions with that score.',
 '"I definitely feel that I am getting better and better and I am tracking well 

In [6]:
tokens

[['@',
  'fansoniclove',
  'gold',
  'the',
  'tenrec',
  'tokyo-bound',
  'sampson',
  'sets',
  'aust',
  'rifle',
  'record',
  '.'],
 ['shooter',
  'dane',
  'sampson',
  'has',
  'struck',
  'career-best',
  'form',
  'as',
  'he',
  'builds',
  'towards',
  'a',
  'third',
  'olympics',
  ',',
  'setting',
  'a',
  'national',
  'record',
  'while',
  'winning',
  'the',
  '50m',
  'rifle',
  'event',
  'at',
  'the',
  'south',
  'australia',
  'championships',
  '.'],
 ['sampson',
  'registered',
  'a',
  'score',
  'of',
  '462',
  'points',
  'to',
  'claim',
  'gold',
  'in',
  'the',
  'three',
  'positions',
  'event',
  '.'],
 ['the',
  'performance',
  'bettered',
  'sampson',
  "'s",
  'own',
  'national',
  'record',
  'of',
  '460.7',
  'points',
  ',',
  'which',
  'he',
  'achieved',
  'at',
  'last',
  'month',
  "'s",
  'wingfield',
  'grand',
  'prix',
  '.'],
 ['the',
  'score',
  'was',
  'also',
  'notably',
  'higher',
  'than',
  'what',
  'italy',
  "'s",
 

- We now have the tokens of every sentence in the documents.
- Next, we count the sentences whose tokens include one of our identifiers: a sentence is counted once if it contains at least one identifier.
- Finally, we print the gold_count and silver_count variables to see the number of documents (sentences) containing a gold or a silver reference.

## Counting number of documents(sentences) containing gold&silver reference

In [7]:
# Count how many tokenized sentences contain at least one gold identifier and
# how many contain at least one silver identifier. A sentence that mentions
# both metals contributes to both counters.
gold_count = silver_count = 0

for token_list in tokens:
    present = set(token_list)
    # isdisjoint() is False exactly when the sentence shares a token with the identifiers.
    gold_count += int(not present.isdisjoint(gold_tokens))
    silver_count += int(not present.isdisjoint(silver_tokens))

print("Number of documents containing gold reference:", gold_count)
print("Number of documents containing silver reference:", silver_count)

Number of documents containing gold reference: 609
Number of documents containing silver reference: 116


- We will calculate the sentiment of each sentence and print the overall (average) sentiment.
- First, a preprocessing step cleans the text so that the sentiment scores are more reliable.
- "doc" means a single sentence.

## Text Preprocessing

- We define some functions that remove unnecessary parts of the text: punctuation, numbers, stopwords and emoji. These are typical preprocessing steps for NLP.
- We apply these functions consecutively.

In [8]:
def remove_punctuation(doc):
    """Return *doc* with every ASCII punctuation character deleted.

    The filtering is done per character (not per word): each character that
    appears in ``string.punctuation`` is dropped and nothing is inserted in
    its place, so ``"career-best"`` becomes ``"careerbest"``. Characters
    outside ``string.punctuation`` (e.g. typographic quotes) are kept.

    Args:
        doc: The text of one sentence.

    Returns:
        The text without ASCII punctuation characters.
    """
    # One translate() pass replaces the former dead ``doc.split()`` assignment
    # and the per-character list comprehension; the result is the same.
    return doc.translate(str.maketrans("", "", string.punctuation))

def remove_numbers(doc):
    """Return *doc* with every digit character removed.

    Each character is tested with ``str.isdigit``; the remaining characters
    are joined back together unchanged, so surrounding spaces are kept.
    """
    return "".join(char for char in doc if not char.isdigit())

def remove_stopwords(doc):
    """Return *doc* without English stopwords.

    The text is split on whitespace, every word found in NLTK's English
    stopword list is dropped, and the remaining words are joined with single
    spaces.

    Args:
        doc: The text of one sentence.

    Returns:
        The remaining words separated by single spaces.
    """
    # Fetch the stopword list once per call and keep it in a set. The original
    # evaluated stopwords.words("english") again for every single word and
    # searched the resulting list linearly.
    english_stopwords = frozenset(stopwords.words("english"))
    return " ".join(w for w in doc.split() if w not in english_stopwords)

def remove_emoji(text):
    """Return *text* with every run of characters matched by the emoji pattern deleted.

    The pattern is a single character class built from a list of code-point
    ranges; matched runs are replaced by the empty string.
    """
    # NOTE(review): the range \U0001FE00-\U0001FE0F may have been intended as
    # the variation selectors \uFE00-\uFE0F -- confirm before relying on it.
    emoji_pattern = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U0001FB00-\U0001FBFF\U0001FE00-\U0001FE0F\U0001F004]+'
    cleaned = re.sub(emoji_pattern, '', text)
    return cleaned

In [9]:
# Text data is easier to manipulate in a tabular format, so we put the
# sentences into a dataframe with one row per sentence.
# The frame is built in a single constructor call: appending row by row with
# DataFrame.append copies the whole frame on every iteration, and that method
# no longer exists in pandas 2.0 and later.
df = pd.DataFrame({'Sentence': sentences})

In [10]:
# Every row (sentence) is cleaned with the functions defined above and the
# result is stored in a new column, Clean_Sentence, which the sentiment step
# works on.
def _clean_sentence(doc):
    """Lowercase *doc*, then strip emoji, punctuation, numbers and stopwords, in that order."""
    cleaned = doc.lower()
    for step in (remove_emoji, remove_punctuation, remove_numbers, remove_stopwords):
        cleaned = step(cleaned)
    return cleaned


df["Clean_Sentence"] = df["Sentence"].apply(_clean_sentence)

In [11]:
df.head()

Unnamed: 0,Sentence,Clean_Sentence
0,@fansoniclove Gold the Tenrec\nTokyo-bound Sam...,fansoniclove gold tenrec tokyobound sampson se...
1,Shooter Dane Sampson has struck career-best fo...,shooter dane sampson struck careerbest form bu...
2,Sampson registered a score of 462 points to cl...,sampson registered score points claim gold thr...
3,The performance bettered Sampson's own nationa...,performance bettered sampsons national record ...
4,The score was also notably higher than what It...,score also notably higher italys niccolo campr...


In [12]:
df.shape
# (rows, columns): the row count is the number of sentences that the sentiment loop iterates over.

(5847, 2)

## Sentiment Analysis

In [13]:
analyzer = SentimentIntensityAnalyzer()
gold_sentiments = []    # compound score of every cleaned sentence that mentions gold
silver_sentiments = []  # compound score of every cleaned sentence that mentions silver
clean_sentences = df["Clean_Sentence"]  # alias for readability

# Iterate over the column itself rather than a hard-coded range(0, 5847), so
# the loop keeps working when the corpus (and therefore the row count) changes.
for clean_sentence in clean_sentences:
    token_list = word_tokenize(clean_sentence)

    has_gold = any(token in gold_tokens for token in token_list)
    has_silver = any(token in silver_tokens for token in token_list)
    if not (has_gold or has_silver):
        continue

    # Score the sentence once, even when it mentions both metals.
    compound = analyzer.polarity_scores(' '.join(token_list))['compound']
    if has_gold:
        gold_sentiments.append(compound)
    if has_silver:
        silver_sentiments.append(compound)

# Overall sentiment is the mean compound score; 0 is reported when no sentence matched.
print("Overall sentiment value for gold:", sum(gold_sentiments) / len(gold_sentiments) if gold_sentiments else 0)
print("Overall sentiment value for silver:", sum(silver_sentiments) / len(silver_sentiments) if silver_sentiments else 0)


Overall sentiment value for gold: 0.19689226973684212
Overall sentiment value for silver: 0.23296896551724136


In [14]:
# Sentiment scores of each sentence referencing Gold
gold_sentiments

[0.0,
 0.0,
 0.8555,
 0.296,
 0.4404,
 -0.296,
 0.34,
 0.2023,
 -0.2023,
 0.3612,
 -0.6124,
 0.0,
 0.926,
 0.5574,
 0.4767,
 0.875,
 0.8807,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.539,
 -0.6486,
 -0.128,
 -0.0516,
 0.3182,
 0.5423,
 0.2263,
 0.0,
 0.0,
 0.4588,
 0.8481,
 0.8316,
 0.0,
 0.0,
 0.6597,
 0.3818,
 0.0,
 -0.1027,
 0.0,
 0.0,
 0.8807,
 0.9403,
 0.8689,
 0.0,
 0.7096,
 0.0,
 0.0,
 0.6249,
 0.0,
 0.7351,
 0.4019,
 0.128,
 0.4019,
 0.9001,
 0.4404,
 0.0,
 0.802,
 0.0,
 -0.7783,
 -0.0516,
 0.0,
 0.3818,
 0.0,
 0.0,
 0.1531,
 0.4404,
 0.4588,
 -0.0258,
 0.0,
 0.6486,
 0.0,
 -0.5574,
 0.3612,
 0.5994,
 0.5267,
 0.1263,
 0.4215,
 -0.2732,
 0.0,
 -0.0516,
 0.2732,
 0.0,
 -0.4019,
 0.6705,
 -0.296,
 -0.4019,
 0.0935,
 0.7506,
 0.8402,
 0.0,
 -0.6808,
 0.9081,
 -0.4767,
 0.0,
 0.886,
 -0.1027,
 0.0,
 0.4767,
 0.891,
 0.9313,
 0.8555,
 0.4588,
 0.0,
 0.2732,
 0.8225,
 0.8591,
 0.0,
 -0.6597,
 -0.3818,
 -0.4767,
 0.4404,
 0.0,
 0.2023,
 0.4939,
 0.0,
 0.0,
 -0.128,
 0.0,
 0.6249,
 0.6486,
 0.5994,


In [15]:
# Sentiment scores of each sentence referencing Silver
silver_sentiments

[0.875,
 0.6808,
 0.9403,
 0.4019,
 0.4019,
 0.4404,
 0.4588,
 0.1263,
 -0.2732,
 0.0,
 0.2732,
 0.9081,
 0.0,
 0.6597,
 0.7184,
 0.0,
 0.5859,
 0.0,
 0.4404,
 0.4404,
 0.0,
 0.4767,
 -0.7579,
 0.0,
 -0.4215,
 0.0,
 0.0,
 0.5719,
 0.0,
 0.0,
 0.9337,
 0.743,
 0.3182,
 0.6124,
 0.0,
 0.0772,
 0.765,
 0.2732,
 0.959,
 0.296,
 0.8074,
 0.4019,
 0.0,
 -0.4404,
 0.0,
 0.0,
 0.0,
 0.4939,
 0.9714,
 0.2129,
 0.7579,
 0.0,
 0.4767,
 0.4767,
 0.0,
 0.0,
 -0.0258,
 0.0,
 -0.296,
 0.0516,
 0.0258,
 -0.4588,
 -0.9547,
 -0.4019,
 0.0,
 -0.2263,
 0.9468,
 0.9246,
 0.7003,
 0.0772,
 0.6249,
 0.8008,
 0.0,
 0.0,
 0.0,
 0.2732,
 0.4939,
 0.0,
 0.0,
 0.7506,
 0.296,
 0.0,
 -0.4588,
 0.0,
 0.0,
 0.0,
 -0.6124,
 0.0,
 0.507,
 0.0,
 0.0,
 0.0,
 -0.7906,
 0.872,
 0.34,
 0.4215,
 0.2263,
 0.2023,
 0.0,
 0.0,
 0.4019,
 0.0,
 0.8126,
 0.9432,
 0.7506,
 0.0,
 0.1779,
 0.4215,
 -0.1531,
 0.7906,
 0.7717,
 0.5719,
 0.3818,
 0.0,
 -0.5267,
 0.2874]