In [1]:
import pandas as pd

In [3]:
data = pd.read_csv("gender_classifier.csv", encoding="latin1")

In [4]:
data = pd.concat([data.gender, data.description], axis=1)

In [5]:
data.head()

Unnamed: 0,gender,description
0,male,i sing my own rhythm.
1,male,I'm the author of novels filled with family dr...
2,male,louis whining and squealing and all
3,male,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe..."
4,female,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...


In [6]:
data.dropna(axis = 0, inplace=True)

In [7]:
data.gender = [1 if each =="female" else 0 for each in data.gender]

In [8]:
data.head()

Unnamed: 0,gender,description
0,0,i sing my own rhythm.
1,0,I'm the author of novels filled with family dr...
2,0,louis whining and squealing and all
3,0,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe..."
4,1,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...


In [9]:
## cleaning data

In [10]:
import re

In [11]:
fd = data.description[4]

In [12]:
fd

'Ricky Wilson The Best FRONTMAN/Kaiser Chiefs The Best BAND Xxxx Thank you Kaiser Chiefs for an incredible year of gigs and memories to cherish always :) Xxxxxxx'

In [13]:
des = re.sub("[^a-zA-Z]", " ",fd)

In [14]:
des

'Ricky Wilson The Best FRONTMAN Kaiser Chiefs The Best BAND Xxxx Thank you Kaiser Chiefs for an incredible year of gigs and memories to cherish always    Xxxxxxx'

In [15]:
des = des.lower()

In [16]:
des

'ricky wilson the best frontman kaiser chiefs the best band xxxx thank you kaiser chiefs for an incredible year of gigs and memories to cherish always    xxxxxxx'

In [17]:
## stopwords (irrevelant)

In [18]:
import nltk

In [19]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hakan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.corpus import stopwords

In [23]:
des = des.split() #nltk.word_tokenize(des)

AttributeError: 'list' object has no attribute 'split'

In [24]:
des

['ricky',
 'wilson',
 'the',
 'best',
 'frontman',
 'kaiser',
 'chiefs',
 'the',
 'best',
 'band',
 'xxxx',
 'thank',
 'you',
 'kaiser',
 'chiefs',
 'for',
 'an',
 'incredible',
 'year',
 'of',
 'gigs',
 'and',
 'memories',
 'to',
 'cherish',
 'always',
 'xxxxxxx']

In [25]:
des = [word for word in des if not word in set(stopwords.words("english"))]

In [26]:
des

['ricky',
 'wilson',
 'best',
 'frontman',
 'kaiser',
 'chiefs',
 'best',
 'band',
 'xxxx',
 'thank',
 'kaiser',
 'chiefs',
 'incredible',
 'year',
 'gigs',
 'memories',
 'cherish',
 'always',
 'xxxxxxx']

In [28]:
## Lemmatization (kök bulma)

In [29]:
import nltk as nlp

In [30]:
lemma = nlp.WordNetLemmatizer()

In [32]:
des = [lemma.lemmatize(word) for word in des]

In [33]:
des

['ricky',
 'wilson',
 'best',
 'frontman',
 'kaiser',
 'chief',
 'best',
 'band',
 'xxxx',
 'thank',
 'kaiser',
 'chief',
 'incredible',
 'year',
 'gig',
 'memory',
 'cherish',
 'always',
 'xxxxxxx']

In [34]:
des = " ".join(des)

In [35]:
des

'ricky wilson best frontman kaiser chief best band xxxx thank kaiser chief incredible year gig memory cherish always xxxxxxx'

In [36]:
## data cleaning

In [38]:
des_list = []
for des in data.description:
    des = re.sub("[^a-zA-Z]", " ",des)
    des = des.lower()
    des = nltk.word_tokenize(des)
    des = [word for word in des if not word in set(stopwords.words("english"))]
    lemma = nlp.WordNetLemmatizer()
    des = [lemma.lemmatize(word) for word in des]
    des = " ".join(des)
    des_list.append(des)

In [41]:
des_list

['sing rhythm',
 'author novel filled family drama romance',
 'louis whining squealing',
 'mobile guy er shazam google kleiner perkins yahoo sprint pc airtouch air force stanford gsb uva dad husband brother golfer',
 'ricky wilson best frontman kaiser chief best band xxxx thank kaiser chief incredible year gig memory cherish always xxxxxxx',
 'know',
 'global marketplace image video music sharing photo inspiration design tip video creative community',
 'secret getting ahead getting started',
 'pll fan crazy mcd ramen bae',
 'renaissance art historian university nottingham fuelled haribo partial coffee soft spot renaissance china national teaching fellow',
 'clean food taste great providing energy nutrient guilt granola vegan paleo friendly option cert organic gf kosher',
 'highly extraordinary auction',
 'senior xi xii mmxiv',
 'come join fastest blog network online today http co mfpa vgk http co mpuuqtyf g cover credit repair credit card bankruptcy',
 'im p bo burnham disney world',
 

In [43]:
## bag of words

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
max_features = 1500

In [70]:
c_v = CountVectorizer(max_features=max_features, stop_words="english")

In [71]:
sm = c_v.fit_transform(des_list).toarray()

In [53]:
print("en çok {} kelimee: {}".format(max_features, c_v.get_feature_names()))

en çok 500 kelimee: ['account', 'activist', 'actor', 'addict', 'adult', 'adventure', 'advocate', 'alum', 'amazing', 'america', 'american', 'angel', 'animal', 'anime', 'app', 'area', 'art', 'artist', 'ask', 'aspiring', 'author', 'award', 'away', 'awesome', 'baby', 'bad', 'band', 'based', 'beautiful', 'beauty', 'beer', 'believe', 'best', 'better', 'big', 'bio', 'bit', 'bitch', 'black', 'blog', 'blogger', 'blue', 'book', 'booking', 'born', 'bot', 'boy', 'brand', 'breaking', 'building', 'business', 'ca', 'car', 'care', 'cat', 'cause', 'ceo', 'certified', 'change', 'channel', 'check', 'chicago', 'chief', 'child', 'christ', 'christian', 'city', 'class', 'club', 'coach', 'coffee', 'college', 'com', 'come', 'comic', 'coming', 'communication', 'community', 'company', 'computer', 'conservative', 'consultant', 'contact', 'content', 'continuous', 'control', 'cool', 'country', 'county', 'crazy', 'create', 'creative', 'creator', 'culture', 'currently', 'dad', 'daily', 'dance', 'data', 'day', 'deal',



In [54]:
## text classification

In [72]:
y = data.iloc[:,0].values
x = sm

In [73]:
from sklearn.model_selection import train_test_split

In [74]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1, random_state=42)

In [75]:
from sklearn.naive_bayes import GaussianNB

In [76]:
nb = GaussianNB()

In [77]:
nb.fit(x_train, y_train)

GaussianNB()

In [78]:
y_pred = nb.predict(x_test)

In [80]:
nb.score(x_test,y_test)

0.5822550831792976