### Task 0.
Execute the notebook and complete listed exercises (between CODE_START and CODE_END blocks).

In [2]:
import nltk

nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
from nltk.corpus import twitter_samples

In [4]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

positive_tweets[50]

'@groovinshawn they are rechargeable and it normally comes with a charger when u buy it :)'

In [5]:
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
print(tweet_tokens[50])

['@groovinshawn', 'they', 'are', 'rechargeable', 'and', 'it', 'normally', 'comes', 'with', 'a', 'charger', 'when', 'u', 'buy', 'it', ':)']


In [6]:
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tag import pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


In [27]:
pos_tag(tweet_tokens[50])

[('@groovinshawn', 'NN'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('rechargeable', 'JJ'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('normally', 'RB'),
 ('comes', 'VBZ'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('charger', 'NN'),
 ('when', 'WRB'),
 ('u', 'JJ'),
 ('buy', 'VB'),
 ('it', 'PRP'),
 (':)', 'JJ')]

In [21]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Let’s write a function that will lemmatize twitter tokens.

In [22]:
from nltk.stem.wordnet import WordNetLemmatizer
tokens = tweet_tokens[50]

In [23]:
# Create a lemmatizer
lemmatizer = WordNetLemmatizer()

Convert PoS tags into a format used by the lemmatizer using the following rules:

* NN -> n
* VB -> v
* else -> a

In [32]:
def lemmatize_sentence(tokens):
    lemmatized_sentence = []

    # CODE_START
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    # CODE_END

    return lemmatized_sentence

lemmatize_sentence(tokens)

['@groovinshawn',
 'they',
 'be',
 'rechargeable',
 'and',
 'it',
 'normally',
 'come',
 'with',
 'a',
 'charger',
 'when',
 'u',
 'buy',
 'it',
 ':)']

Now we can proceed to processing. During processing, we will perform cleanup:

remove URLs and mentions using regexes
after lemmatization, remove stopwords

In [33]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [34]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(len(stop_words))
for i in range(10):
    print(stop_words[i])

198
a
about
above
after
again
against
ain
all
am
an


Now, please write the process_tokens() function. It should be an improved version of lemmatize_sentence() function above.

It should do the following:

1. Iterate through pos_tag(tweet_tokens).
2. Use regex to remove tokens matching URLs or mentions (@somebody).
3. Remove tokens that stop words or are punctuation symbols (use Python’s built-in string.punctuation).
4. Lowercase all tokens
5. Lemmatize using WordNetLemmatizer.
6. Return the list of cleaned_tokens.

In [35]:
import re, string

def process_tokens(tweet_tokens):

    cleaned_tokens = []
    stop_words = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(tweet_tokens):
      # CODE_START
        if (re.search(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', token) or 
            re.search(r'(@[A-Za-z0-9_]+)', token)):
            continue

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
   
        token = lemmatizer.lemmatize(token, pos)

        if token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
      # CODE_END
    return cleaned_tokens

In [36]:
print("Before:", tweet_tokens[50])
print("After:", process_tokens(tweet_tokens[50]))

Before: ['@groovinshawn', 'they', 'are', 'rechargeable', 'and', 'it', 'normally', 'comes', 'with', 'a', 'charger', 'when', 'u', 'buy', 'it', ':)']
After: ['rechargeable', 'normally', 'come', 'charger', 'u', 'buy', ':)']


Now run process_tokens on all positive/negative tokens (use tokenized method as mentioned above).

In [38]:
# CODE_START
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = [process_tokens(tokens) for tokens in positive_tweet_tokens]
negative_cleaned_tokens_list = [process_tokens(tokens) for tokens in negative_tweet_tokens]
# CODE_END

In [39]:
print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


Now, let’s check what words are most common.

First, add a helper function get_all_words:

In [40]:
def get_all_words(cleaned_tokens_list):
  # CODE_START
    return [w for tokens in cleaned_tokens_list for w in tokens]
  # CODE_END
all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [41]:
from nltk import FreqDist

# CODE_START
# use all_pos_words
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))
# CODE_END

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


### Task 1. 
Change the code so it removes hashtags during pre-processing. (E.g. #Ukraine).

In [42]:
def process_tokens_modify(tokens):

    cleaned_tokens = []
    stop_words = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(tokens):
        if (re.search(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', token) or 
            re.search(r'(@[A-Za-z0-9_]+)', token) or re.search(r'#[A-Za-z0-9_]+', token)):
            continue

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
   
        token = lemmatizer.lemmatize(token, pos)

        if token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [44]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = [process_tokens_modify(tokens) for tokens in positive_tweet_tokens]
negative_cleaned_tokens_list = [process_tokens_modify(tokens) for tokens in negative_tweet_tokens]

In [45]:
print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', ':d']


### Task 3. 
Let’s suppose that semantic distance between words is the distance to the common semantic parent (hypernym). Write a function that will compute this distance between two words.

In [14]:
from nltk.corpus import wordnet as wn

def semantic_distance(word1, word2):
    synsets1 = wn.synsets(word1)[0]
    synsets2 = wn.synsets(word2)[0]
    
    distance = synsets1.shortest_path_distance(synsets2)

    return distance

print(semantic_distance("dog", "cat"))  
print(semantic_distance("car", "rose"))   

4
14
