# NLP — Natural Language Processing

Download Twitter Dataset

In [1]:
import nltk                              
from nltk.corpus import twitter_samples  

In [2]:
# One-time setup: uncomment and run this line if the 'twitter_samples'
# corpus has not been downloaded to this machine yet.
# nltk.download('twitter_samples')

In [3]:
# Read the raw tweet texts of both sentiment classes from the NLTK corpus.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(file_id)
    for file_id in ('positive_tweets.json', 'negative_tweets.json')
)

In [4]:
# Report the size of each class.
positive_count = len(all_positive_tweets)
negative_count = len(all_negative_tweets)

print('Number of positive tweets: ', positive_count)
print('Number of negative tweets: ', negative_count)

Number of positive tweets:  5000
Number of negative tweets:  5000


In [5]:
# Show one raw example; the cells below tokenize this same tweet.
first_positive_tweet = all_positive_tweets[0]
print(first_positive_tweet)

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


# Tokenization

Tokenization, bir metni kendisini oluşturan sözcüklere ve sembollere (token'lara) ayırma işlemidir.

In [6]:
from nltk.tokenize import word_tokenize  
from nltk.tokenize import TweetTokenizer 

In [7]:
# Baseline: naive tokenization by splitting on single space characters.
space_separated_tokens = all_positive_tweets[0].split(" ")
print(space_separated_tokens)

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


In [8]:
# General-purpose NLTK tokenizer applied to the first positive tweet.
# The result is kept in `wrd_tknz` because the lemmatization cell reuses it.
first_tweet_text = all_positive_tweets[0]
wrd_tknz = word_tokenize(first_tweet_text)

print(wrd_tknz)

['#', 'FollowFriday', '@', 'France_Inte', '@', 'PKuchly57', '@', 'Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':', ')']


In [9]:
# Tweet-aware tokenizer; strip_handles=True is passed so @handles are removed.
tokenizer = TweetTokenizer(strip_handles=True)

# Tokenize the first positive tweet, then display the tokens.
tweet_tokens = tokenizer.tokenize(all_positive_tweets[0])
print(tweet_tokens)

['#FollowFriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


# Lemmatization

Lemmatization, kelimeleri morfolojik olarak inceleyerek sözlük biçimlerine indirger. Örneğin “gidiyorlar”, gitmek fiilinin şimdiki zamanda üçüncü çoğul şahıs çekimidir. Kelimenin çekimlenmemiş bu ilk haline lemma denir; bu örnekte lemma “gitmek”tir.

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer

# One-time setup: uncomment if the WordNet data is not available locally.
# nltk.download('wordnet')

# Lemmatizer instance used by the next cell.
wnl = WordNetLemmatizer()

In [11]:
# Print every word_tokenize token next to the lemma WordNet returns for it.
for token in wrd_tknz:
    lemma = wnl.lemmatize(token)
    print(f"{token} ---> {lemma}")

# ---> #
FollowFriday ---> FollowFriday
@ ---> @
France_Inte ---> France_Inte
@ ---> @
PKuchly57 ---> PKuchly57
@ ---> @
Milipol_Paris ---> Milipol_Paris
for ---> for
being ---> being
top ---> top
engaged ---> engaged
members ---> member
in ---> in
my ---> my
community ---> community
this ---> this
week ---> week
: ---> :
) ---> )


In [12]:
# One-time setup: install or upgrade spaCy before running the next cell.
# NOTE(review): the next cell loads "en_core_web_sm"; presumably that model
# must be installed separately as well — confirm for this environment.
# pip install -U spacy

In [13]:
import spacy

# Load the small English pipeline and run it over the first positive tweet.
sp = spacy.load("en_core_web_sm")
wrd = sp(all_positive_tweets[0])

# Print each spaCy token next to its lemma.
for token in wrd:
    print(f"{token.text} ---> {token.lemma_}")

# ---> #
FollowFriday ---> FollowFriday
@France_Inte ---> @france_inte
@PKuchly57 ---> @PKuchly57
@Milipol_Paris ---> @milipol_paris
for ---> for
being ---> be
top ---> top
engaged ---> engaged
members ---> member
in ---> in
my ---> -PRON-
community ---> community
this ---> this
week ---> week
:) ---> :)


In [14]:
# One-time setup: install TextBlob before running the next cell.
# pip install textblob

In [15]:
from textblob import TextBlob, Word

# Wrap the first positive tweet in a TextBlob to get its word list.
sentence = all_positive_tweets[0]
s = TextBlob(sentence)

# Print each TextBlob word next to its lemma.
for w in s.words:
    print(f"{w} ---> {w.lemmatize()}")

FollowFriday ---> FollowFriday
France_Inte ---> France_Inte
PKuchly57 ---> PKuchly57
Milipol_Paris ---> Milipol_Paris
for ---> for
being ---> being
top ---> top
engaged ---> engaged
members ---> member
in ---> in
my ---> my
community ---> community
this ---> this
week ---> week
