# 1. Metin Ön İşleme

In [1]:
import nltk
import textblob
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob, Word
from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
# Raw sample text: one story title per line. Some titles carry stray
# punctuation and digits, which the later cleaning cells strip out.
# The literal opens and closes with a newline, so splitting on "\n" yields an
# empty first and last entry.
metin = """
A Scandal in Bohemia! 01
The Red-headed League,2
A Case, of Identity 33
The Boscombe Valley Mystery4
The Five Orange Pips1
The Man with? the Twisted Lip
The Adventure of the Blue Carbuncle
The Adventure of the Speckled Band
The Adventure of the Engineer's Thumb
The Adventure of the Noble Bachelor
The Adventure of the Beryl Coronet
The Adventure of the Copper Beeches
"""

metin

"\nA Scandal in Bohemia! 01\nThe Red-headed League,2\nA Case, of Identity 33\nThe Boscombe Valley Mystery4\nThe Five Orange Pips1\nThe Man with? the Twisted Lip\nThe Adventure of the Blue Carbuncle\nThe Adventure of the Speckled Band\nThe Adventure of the Engineer's Thumb\nThe Adventure of the Noble Bachelor\nThe Adventure of the Beryl Coronet\nThe Adventure of the Copper Beeches\n"

## A. Metinlerin Vektörlere Dönüştürülmesi

In [3]:
# Break the raw text into one entry per line and wrap the pieces in a Series.
v_metin = pd.Series(metin.split("\n"))
v_metin

0                                          
1                  A Scandal in Bohemia! 01
2                   The Red-headed League,2
3                    A Case, of Identity 33
4              The Boscombe Valley Mystery4
5                     The Five Orange Pips1
6             The Man with? the Twisted Lip
7       The Adventure of the Blue Carbuncle
8        The Adventure of the Speckled Band
9     The Adventure of the Engineer's Thumb
10      The Adventure of the Noble Bachelor
11       The Adventure of the Beryl Coronet
12      The Adventure of the Copper Beeches
13                                         
dtype: object

In [4]:
# Skip the first (empty) entry and keep everything after it, by position.
# NOTE(review): the empty entry at the end of the Series is still retained here.
metin_vektoru = v_metin.iloc[1:]
metin_vektoru

1                  A Scandal in Bohemia! 01
2                   The Red-headed League,2
3                    A Case, of Identity 33
4              The Boscombe Valley Mystery4
5                     The Five Orange Pips1
6             The Man with? the Twisted Lip
7       The Adventure of the Blue Carbuncle
8        The Adventure of the Speckled Band
9     The Adventure of the Engineer's Thumb
10      The Adventure of the Noble Bachelor
11       The Adventure of the Beryl Coronet
12      The Adventure of the Copper Beeches
13                                         
dtype: object

In [5]:
# Put the titles into a one-column frame ("Hikayeler") that the following
# cleaning cells rewrite step by step; the Series index is carried over.
mdf = pd.DataFrame({"Hikayeler": metin_vektoru})
mdf

Unnamed: 0,Hikayeler
1,A Scandal in Bohemia! 01
2,"The Red-headed League,2"
3,"A Case, of Identity 33"
4,The Boscombe Valley Mystery4
5,The Five Orange Pips1
6,The Man with? the Twisted Lip
7,The Adventure of the Blue Carbuncle
8,The Adventure of the Speckled Band
9,The Adventure of the Engineer's Thumb
10,The Adventure of the Noble Bachelor


## B. Büyük/Küçük Harf

In [6]:
# Lower-case every word of every title; the split/join round trip also
# collapses any run of whitespace into a single space.
mdf["Hikayeler"] = mdf["Hikayeler"].apply(
    lambda satir: " ".join(map(str.lower, satir.split()))
)
mdf

Unnamed: 0,Hikayeler
1,a scandal in bohemia! 01
2,"the red-headed league,2"
3,"a case, of identity 33"
4,the boscombe valley mystery4
5,the five orange pips1
6,the man with? the twisted lip
7,the adventure of the blue carbuncle
8,the adventure of the speckled band
9,the adventure of the engineer's thumb
10,the adventure of the noble bachelor


## C. Noktalama İşaretlerinin Kaldırılması 

In [7]:
# Remove every character that is neither a word character nor whitespace.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave all punctuation in
# place. The raw string keeps "\w" and "\s" from being parsed as (invalid)
# string escape sequences.
mdf["Hikayeler"] = mdf["Hikayeler"].str.replace(r"[^\w\s]", "", regex=True)
mdf

Unnamed: 0,Hikayeler
1,a scandal in bohemia 01
2,the redheaded league2
3,a case of identity 33
4,the boscombe valley mystery4
5,the five orange pips1
6,the man with the twisted lip
7,the adventure of the blue carbuncle
8,the adventure of the speckled band
9,the adventure of the engineers thumb
10,the adventure of the noble bachelor


## D. Sayıların Kaldırılması

In [8]:
# Remove every digit from the titles.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, so "\d" would never match a digit.
# The raw string keeps "\d" from being parsed as an (invalid) string escape.
mdf["Hikayeler"] = mdf["Hikayeler"].str.replace(r"\d", "", regex=True)
mdf

Unnamed: 0,Hikayeler
1,a scandal in bohemia
2,the redheaded league
3,a case of identity
4,the boscombe valley mystery
5,the five orange pips
6,the man with the twisted lip
7,the adventure of the blue carbuncle
8,the adventure of the speckled band
9,the adventure of the engineers thumb
10,the adventure of the noble bachelor


## E. Stopwords Kaldırılması

In [9]:
# Fetch the NLTK "stopwords" corpus that stopwords.words("english") reads;
# quiet=True is passed to keep the download from printing progress output.
nltk.download("stopwords", quiet = True)

True

In [10]:
# English stop-word list from NLTK, kept as a list under its original name.
sw = stopwords.words("english")
# Build the lookup set once: each token check becomes a hash lookup instead of
# a scan over the whole stop-word list.
sw_set = frozenset(sw)
# Keep only the words of each title that are not stop words.
mdf["Hikayeler"] = mdf["Hikayeler"].apply(
    lambda satir: " ".join(kelime for kelime in satir.split() if kelime not in sw_set)
)
mdf

Unnamed: 0,Hikayeler
1,scandal bohemia
2,redheaded league
3,case identity
4,boscombe valley mystery
5,five orange pips
6,man twisted lip
7,adventure blue carbuncle
8,adventure speckled band
9,adventure engineers thumb
10,adventure noble bachelor


## F. Az Geçen Kelimelerin Kaldırılması

In [11]:
# Flatten all titles into one word list and count how often each word occurs.
tum_kelimeler = " ".join(mdf["Hikayeler"]).split()
pd.Series(tum_kelimeler).value_counts()

adventure    6
carbuncle    1
beryl        1
five         1
bohemia      1
redheaded    1
lip          1
identity     1
twisted      1
orange       1
boscombe     1
scandal      1
man          1
bachelor     1
beeches      1
band         1
engineers    1
case         1
blue         1
mystery      1
thumb        1
noble        1
pips         1
valley       1
copper       1
coronet      1
speckled     1
league       1
dtype: int64

* Frekans listesinin sonundaki üç kelimeyi silelim.

In [12]:
# Recompute the word frequencies and take the three entries at the tail of the
# table (by position) as the words to remove.
kelime_sayilari = pd.Series(" ".join(mdf["Hikayeler"]).split()).value_counts()
sil = kelime_sayilari.iloc[-3:]
sil

coronet     1
speckled    1
league      1
dtype: int64

In [13]:
# Drop the selected rare words from every title. The words are the index
# labels of `sil`, so membership is tested against the index explicitly
# (which is also what `in` on a Series checks).
mdf["Hikayeler"] = mdf["Hikayeler"].apply(
    lambda satir: " ".join(kelime for kelime in satir.split() if kelime not in sil.index)
)
mdf

Unnamed: 0,Hikayeler
1,scandal bohemia
2,redheaded
3,case identity
4,boscombe valley mystery
5,five orange pips
6,man twisted lip
7,adventure blue carbuncle
8,adventure band
9,adventure engineers thumb
10,adventure noble bachelor


## G. Tokenization (Cümleleri Kelimelere Ayırıp Yeni Liste Oluşturmak)

In [14]:
# Fetch the NLTK "punkt" tokenizer data before the tokenization cell;
# quiet=True is passed to keep the download from printing progress output.
nltk.download("punkt", quiet = True)

True

In [15]:
# Turn each cleaned title into its TextBlob word list, then wrap the resulting
# Series in a DataFrame for display.
kelime_listeleri = mdf["Hikayeler"].apply(lambda satir: TextBlob(satir).words)
tokenize = pd.DataFrame(kelime_listeleri)
tokenize

Unnamed: 0,Hikayeler
1,"[scandal, bohemia]"
2,[redheaded]
3,"[case, identity]"
4,"[boscombe, valley, mystery]"
5,"[five, orange, pips]"
6,"[man, twisted, lip]"
7,"[adventure, blue, carbuncle]"
8,"[adventure, band]"
9,"[adventure, engineers, thumb]"
10,"[adventure, noble, bachelor]"


## H. Stemming (Kelimeleri Köklerine Ayırma)

In [16]:
# Reduce every word of every title to its Porter stem. The result is only
# displayed; `mdf` itself is left unchanged.
st = PorterStemmer()
mdf["Hikayeler"].apply(
    lambda satir: " ".join(st.stem(kelime) for kelime in satir.split())
)

1            scandal bohemia
2                    redhead
3                 case ident
4     boscomb valley mysteri
5             five orang pip
6              man twist lip
7     adventur blue carbuncl
8              adventur band
9       adventur engin thumb
10    adventur nobl bachelor
11            adventur beryl
12     adventur copper beech
13                          
Name: Hikayeler, dtype: object

## I. Lemmatization

In [17]:
# Fetch the NLTK "wordnet" corpus before the lemmatization cell;
# quiet=True is passed to keep the download from printing progress output.
nltk.download("wordnet", quiet = True)

True

In [18]:
# Lemmatize each title word by word with TextBlob's Word. The result is only
# displayed; `mdf` itself is left unchanged.
mdf["Hikayeler"].apply(
    lambda satir: " ".join([Word(kelime).lemmatize() for kelime in satir.split()])
)

1              scandal bohemia
2                    redheaded
3                case identity
4      boscombe valley mystery
5              five orange pip
6              man twisted lip
7     adventure blue carbuncle
8               adventure band
9     adventure engineer thumb
10    adventure noble bachelor
11             adventure beryl
12      adventure copper beech
13                            
Name: Hikayeler, dtype: object