In [81]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

from TurkishStemmer import TurkishStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re


## This is the test run: How well does the scraper script do on one song?

In [82]:
url = 'http://sarki.alternatifim.com/sarkici/sezen-aksu/1980'
results = requests.get(url)

soup = BeautifulSoup(results.content, 'lxml')

artist_and_song_title = soup.find('h3', {'class':'baslik'}).text

artist = artist_and_song_title.split('-')[0].strip()
song_title = artist_and_song_title.split('-')[1].strip()

album_title = soup.find('tr').find('td', {'style':None}).text

song_text = soup.find('div', {"class":'sarkisozu'}).text

song_text = song_text[:song_text.find('*')].replace('\r', '').replace('\n', '').replace('/','')

print("Artist: ", artist)
print("Song Title: ", song_title)
print("Album: ", album_title)

artist_col = []
title_col = []
album_col = []
text_col = []

artist_col.append(artist)
title_col.append(song_title)
album_col.append(album_title)
text_col.append(song_text)

pd.DataFrame({
    'artist': artist_col,
    'title': title_col,
    'album': album_col,
    'text': text_col
})

Artist:  Sezen Aksu
Song Title:  1980
Album:  1980


Unnamed: 0,artist,title,album,text
0,Sezen Aksu,1980,1980,Sigaramın dumanına sarsam saklasam seni Gitme ...


In [83]:
test_text = song_text.title()

In [158]:
test_text

'Sigaramın Dumanına Sarsam Saklasam Seni Gitme Gitme Gittiğin Yollardan Dönülmez Geri Gitme Gitme El Olursun Sevdiğim Incitir Beni  Yokluğun Ah Yol Yol Olsa Uzasa Unutmam Seni Gitme Gitme Gittiğin Yollardan Dönülmez Geri Gitme Gitme El Olursun Sevdiğim Incitir Beni  Akşam Vakti Sardı Yine Hüzünler Kalbim Yangın Yeri Gel Kurtar Beni Senden Akşam Vakti Dolaştım Sokaklarda Yırtık Bir Afiş Seni Gördüm Duvarda  Sigaramın Dumanına Sarsam Saklasam Seni Yokluğun Ah Yol Yol Olsa Uzasa Unutmam Seni Gitme Gitme Gittiğin Yollardan Dönülmez Geri Gitme Gitme El Olursun Sevdiğim Incitir Beni'

In [155]:
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
        1. Tokenizes and removes punctuation
        2. Removes stopwords
        3. Stems
        4. Returns a list of the cleaned text
    '''
    if pd.isnull(text):
        return []
    # tokenizing and removing punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed=tokenizer.tokenize(text)
    
    # removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('turkish')]
    
    # stemming
    stemmer = TurkishStemmer()
    
    text_processed = [stemmer.stem(word) for word in text_processed]
    
    try:
        text_processed.remove('b')
    except: 
        pass

    return " ".join(text_processed) ## <-- we're keeping our words distinct

In [159]:
text_process(test_text)

'sigara duman sar sakl sen git git gittik yol dönülmez ger git git el olur sevdik inci ben yokluk ah yol yol ols uzas unutma sen git git gittik yol dönülmez ger git git el olur sevdik inci ben akşa vakti sart yin hüz kalbim yang yer gel kurtar ben sen akşa vakti dolaş sokak yır bir afiş sen gört duvar sigara duman sar sakl sen yokluk ah yol yol ols uzas unutma sen git git gittik yol dönülmez ger git git el olur sevdik inci ben'

## Now testing the scraper for getting song names from one page: 

In [88]:
url = 'http://sarki.alternatifim.com/sarkici/sezen-aksu/sayfa-1'
results = requests.get(url)
if results.status_code !='200':
    pass
soup = BeautifulSoup(results.content, 'lxml')

list_items = soup.find('ul').find_all('li')

this_list = [i.find('a').attrs['href'] for i in list_items]

print("Scraped {} song URLS from this page. ".format(len(this_list)))

Scraped 25 song URLS from this page. 


## Building a master list for Sezen Aksu song URLs on this site:

In [89]:
master_list = []
for i in range(1,8):
    url = 'http://sarki.alternatifim.com/sarkici/sezen-aksu/sayfa-' + str(i)
    results = requests.get(url)
    if results.status_code !='200':
        pass
    soup = BeautifulSoup(results.content, 'lxml')
    list_items = soup.find('ul').find_all('li')
    this_list = [i.find('a').attrs['href'] for i in list_items]
    master_list += this_list

print("Scraped {} songs for Sezen Aksu on this site.".format(len(master_list)))

Scraped 171 songs for Sezen Aksu on this site.


## Here's the scraper script using the master list generated above.

In [120]:
artist_col = []
title_col = []
album_col = []
text_col = []


for item in master_list:
    url = 'http://sarki.alternatifim.com' + item
    results = requests.get(url)
    if results.status_code != '200':
        print('Error in scraping from', item)
        pass
    soup = BeautifulSoup(results.content, 'lxml')
    important_content = soup.find('div', {'class': 'ten columns cleft yazim'})
    title_info = important_content.find('div', {'class':'cbottom'})
    artist_and_song_title = title_info.find('h3').text
    artist = artist_and_song_title.split('-')[0].strip()
    song_title = artist_and_song_title.split('-')[1].strip()
    try:
        album_title = title_info.find('table').find('tbody').find('tr').find_all('td')[2].text
    except:
        album_title = np.nan
        
    song_text = important_content.find('div', {"class":'sarkisozu'}).text

    song_text = song_text[:song_text.find('/*')].replace('\r', ' ').replace('\n', '  ').replace('/',' ')
    artist_col.append(artist)
    title_col.append(song_title)
    album_col.append(album_title)
    text_col.append(song_text)

## Putting this into a DataFrame:

In [121]:
df = pd.DataFrame({
    'artist': artist_col,
    'title': title_col,
    'album': album_col,
    'text': text_col
})

## Building the scraper, there was a built-in way to add NAs when the album wasn't there. That information seems to be the most commonly missing.

In [122]:
# df.isna().sum()

# df['album'].value_counts()

In [123]:
df[(df['album']=='Allahaısmarladık') | (df['album']=='Allahaısmarladık (1977)')]

Unnamed: 0,artist,title,album,text
16,Sezen Aksu,Alev Alev,Allahaısmarladık (1977),Ellerin ne kadar sıcak Tıpkı ateş gibi Bakı...
18,Sezen Aksu,Allahaısmarladık,Allahaısmarladık,Yıllar yılı seviştik de neden mutlu olmadık...
19,Sezen Aksu,Allahaşkına,Allahaısmarladık,Sen istedin diye değil seni sevdiğim için S...
20,Sezen Aksu,Allahın Varsa,Allahaısmarladık,Yaz bitti yine mevsim sonbahar Kim çeker ki...
36,Sezen Aksu,Bir Kış Masalı,Allahaısmarladık,Bir su damlası ürperir tenimde Bir temas ha...
53,Sezen Aksu,Geçen Yaz,Allahaısmarladık (1977),İşte geçti bir yaz daha Geçti gitti rüzgar ...
69,Sezen Aksu,Gözlerindeki Bulut,Allahaısmarladık (1977),Gözlerinde bir bulut var bugün Sanki bir şe...
81,Sezen Aksu,İlk Gün Gibi,Allahaısmarladık,Gitsen de olur söylemem Söylemem tek bir ke...
126,Sezen Aksu,Seni Gidi Vurdumduymaz,Allahaısmarladık,Bu kaçıncı ayrılışın bakmadan gözyaşıma Bı...
144,Sezen Aksu,Söyle Kimsin,Allahaısmarladık (1977),Sen bir rüya bir resimsin Bir heyecan bir i...


## A scraper for getting year and track data on each album
The album part of the scraper seem particularly bad.  
I'm going to access the discogs API to make a second

In [124]:
endpoint = 'https://api.discogs.com/artists/97253/releases'

result = requests.get(endpoint)

sezen = result.json()

sezen.keys()

discography_df = pd.DataFrame(sezen['releases'])

discography_df.columns

disc_df = discography_df[['title','year','resource_url']]

# for endpoint in discography_df['resource_url']:
#     res = requests.get(endpoint)
#     res_json = res.json()
    
endpoint = 'https://api.discogs.com/releases/7156101'
res = requests.get(endpoint)
res_json = res.json()

a = pd.DataFrame(res_json['tracklist'])

a

Unnamed: 0,duration,position,title,type_
0,,A,Haydi Sansim,track
1,,B,Gel Bana,track


Process:
- I have a dataframe with the following important information: title, year, and resource_url
- Each resource_url leads to an api endpoint with a tracklist key
- I want a dataframe with columns album, year, track_title

In [125]:
disc_df.head(2)

Unnamed: 0,title,year,resource_url
0,Haydi Sansim / Gel Bana,1975,https://api.discogs.com/releases/7156101
1,Kusura Bakma / Yaşanmamış Yıllar,1976,https://api.discogs.com/releases/2359231


In [126]:
# # This script is meant to make the discography for sezen aksu. It takes some time so it also saves the dataframe.

# import time

# df_tracks = pd.DataFrame({
#   'title':[],
#   'album':[],
#   'year':[]
# })
# for i in range(disc_df.shape[0]):
# #     print(disc_df.iloc[i,:]['resource_url'])
#     this_endpoint = disc_df.iloc[i,:]['resource_url']
#     this_album = disc_df.iloc[i,:]['title']
#     this_year = disc_df.iloc[i,:]['year']
#     res = requests.get(this_endpoint)
#     res_json = res.json()
# #     print(res_json['tracklist'])
#     this_tracklist = pd.DataFrame(res_json['tracklist'])
# #     print(this_tracklist)
#     this_tracklist['album'] = this_album
#     this_tracklist['year'] = str(this_year)
#     try:
#         this_tracklist = this_tracklist.drop(['duration','position','type_','extraartists'], axis  = 1)
#     except:
#         this_tracklist = this_tracklist.drop(['duration','position','type_'], axis  = 1)
#     print("Adding {} to dataframe...".format(this_album))
#     print("Columns: {}".format(this_tracklist.columns))
#     print('\n')
#     df_tracks = df_tracks.append(this_tracklist, sort=True)
#     time.sleep(3)

# df_tracks.to_csv('sezen_discog.csv')

In [127]:
df_tracks = pd.read_csv('sezen_discog.csv', index_col = 0)

In [128]:
df_tracks.columns

Index(['album', 'title', 'year'], dtype='object')

In [129]:
for item in df['title']:
    if item in set(df_tracks['title']):
        print(item)

Ablam Aşktan Öldü
Adem Olan Anlar
Adı Menekşe
Ağlamak Güzeldir
Ah Mazi
Ahdım Olsun
Akasyalar Açarken
Alev Alev
Ali
Allahaısmarladık
Allahın Varsa
Aşk
Beşik
Bırak Beni
Bile Bile
Biliyorsun
Bir Acı Kahvenin
Bir Başka Aşk
Bir Çocuk Sevdim
Bir Kuş Uçur
Bir Zamanlar Deli Gönlüm
Bu Gece
Çile
Çocuk
Çocuklar Gibi
Farkındayım
Firuze
Gamsız
Geçen Yaz
Geçer
Geçiyor Bizden De
Gel Bana
Gelen Gideni Aratır
Geri Dön
Gidemem
Gidiyorum
Gidiyorum Bu Şehirden
Git
Gölge Etme
Gözlerindeki Bulut
Gözlerine Göz Değmiş
Gül
Güllerim Soldu
Gülümse
Gün Gelir
İçime Sinmiyor
İkili Delilik
İkinci Bahar
İlk Gün Gibi
İnce Mevzu
İstanbul Hatırası
İstanbul İstanbul Olalı
Kaç Yıl Geçti Aradan
Kaçın Kurası
Kaderim
Kahpe Kader
Kalaşnikof
Kalbim Ege'de Kaldı
Kalp Unutmaz
Kutlama
Küçüğüm
Küçük Yaz Çiçeği
Le Le Le
Lunapark
Masum Değiliz
Memet
Menajer
Minik Serçe
Namus
Ne Ağlarsın
Ne Kavgam Bitti Ne Sevdam
Neye Yarar
Nihayet
O Sensin
Seni Gidi Vurdumduymaz
Seni İstiyorum
Seni Kimler Aldı
Seni Yerler
Sensizim
Sızı
Silemezler Gö

In [130]:
df_merged = df.merge(df_tracks, how = 'left', on = 'title')

In [131]:
df_merged.columns

Index(['artist', 'title', 'album_x', 'text', 'album_y', 'year'], dtype='object')

In [132]:
df_merged.drop('album_y', axis = 1, inplace=True)

In [133]:
df_merged.columns = ['artist','title','album','text','year']

In [134]:
df_clean = df_merged[pd.notnull(df_merged['year'])]
df_clean['year'] = df_clean['year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [135]:
df_clean

Unnamed: 0,artist,title,album,text,year
2,Sezen Aksu,Ablam Aşktan Öldü,şarkı söylemek lazım,Ablam aşktan öldü Herşey filmlerdeki gibi o...,2002
5,Sezen Aksu,Adem Olan Anlar,Deli Kızın Türküsü,Ben bu dünyaya bir türlü alışamadım Bu yüzd...,1993
6,Sezen Aksu,Adem Olan Anlar,Deli Kızın Türküsü,Ben bu dünyaya bir türlü alışamadım Bu yüzd...,2007
8,Sezen Aksu,Adı Menekşe,Adı Bende Saklı,"Bu şehrin meydanlarında, garında, rıhtımınd...",1998
9,Sezen Aksu,Ağlamak Güzeldir,Ağlamak Güzeldir,Aglamak guzeldir Suzulurken yaslar gozunden...,1981
12,Sezen Aksu,Ah Mazi,Git,Titresin bir mum alevinde o eski günler Bi...,1990
13,Sezen Aksu,Ah Mazi,Git,Titresin bir mum alevinde o eski günler Bi...,2007
14,Sezen Aksu,Ah Mazi,Git,Titresin bir mum alevinde o eski günler Bi...,2008
15,Sezen Aksu,Ahdım Olsun,Bahane,Geçti yıllar ah geç aydım Anladım ki boşa g...,2005
16,Sezen Aksu,Akasyalar Açarken,Serçe (1978),Yar yolunu kolladım İpek mendil salladım On...,1978


Problems: 
- lots of doubled songs. How can I just keep the earlier date?

In [141]:
df_clean = df_clean.sort_values('year', ascending=False).drop_duplicates(subset = ['title','text'], keep = 'last')

## Now time to tokenize

In [160]:
corpus = df_clean['text'].map(text_process)

In [172]:
corpus[216]

'lâl ol dil ben sen sor ben dak sır ol söz ben kulak ver rüzgârlara kent bul sen ben kent bil sen ben i̇ster yüz sür yağmur i̇ster anlat su kar kapa yol yol sor kar sorm kent bul sen ben kent bil sen ben lâl ol dil ben sen sor ben dak dünya sarhoş çocuk google_ad_client ca pub 7620071422727774'

In [192]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords


cvec = CountVectorizer(stop_words=stopwords.words("turkish") + ['bir']) 

In [193]:
cvec.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani', 'bir'],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [194]:
transformed_corpus = cvec.transform(corpus)

In [195]:
word_table = pd.DataFrame(transformed_corpus.todense(), columns = cvec.get_feature_names()).drop(['7620071422727774', 'google_ad_client'], axis = 1)

In [196]:
word_table.describe()

Unnamed: 0,aah,aahhh,abla,acelen,acem,acı,acık,acıl,acılanma,acılaşır,...,şeref,şeytan,şiir,şikayets,şimdi,şimt,şiş,şte,şubat,şöyl
count,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,...,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0,103.0
mean,0.019417,0.009709,0.009709,0.009709,0.009709,0.058252,0.009709,0.009709,0.009709,0.009709,...,0.009709,0.019417,0.029126,0.019417,0.019417,0.213592,0.009709,0.029126,0.009709,0.009709
std,0.197066,0.098533,0.098533,0.098533,0.098533,0.307592,0.098533,0.098533,0.098533,0.098533,...,0.098533,0.138662,0.168983,0.197066,0.138662,0.800128,0.098533,0.168983,0.098533,0.098533
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.0,1.0,6.0,1.0,1.0,1.0,1.0


In [197]:
pd.DataFrame(word_table.sum()).sort_values(by = 0, ascending = False)

Unnamed: 0,0
sen,142
ben,121
pub,74
ca,74
gün,51
göz,51
var,50
zaman,38
gel,38
olur,38
