# import library

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Read Data PTA

In [2]:
dataPTA = pd.read_excel('PTAscrawl.xlsx')

In [3]:
dataPTA.head(10)

Unnamed: 0,Column1,Column2,Column3,Column4,Column5
0,Judul,Penulis,Dosen 1,Dosen 2,Abstrak
1,SISTEM PENDUKUNG KEPUTUSAN PEMILIHAN KARYAWAN ...,Penulis : Catur Ngesti Waluyo,"Dosen Pembimbing I : Mula'ab,S.Si.,M.Kom","Dosen Pembimbing II :Riza Alfita, S.T., M.T",Sumber daya manusia mutlak dibutuhkan untuk ke...
2,Perancangan Sistem Informasi Badan Kepegawaian...,Penulis : MALIKUL HAMZAH,"Dosen Pembimbing I : Moch. Kautsar Sophan, S.K...","Dosen Pembimbing II :Yeni Kustiyaningsih, S.Ko...",Kantor Badan Kepegawaian kota Bangkalan adalah...
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,Penulis : M Khoiril Anwar,"Dosen Pembimbing I : Cucun Very Angkoso, S.T.,...","Dosen Pembimbing II :Arik Kurniawati S. Kom., ...",Teknologi mobile game beroperating system open...
4,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Penulis : Muhammad Choirur Rozi,"Dosen Pembimbing I : Dr. Arif Muntasa, S.Si.,M.T","Dosen Pembimbing II :Fitri Damayanti, S.Kom.,M...",Sistem pengenalan wajah adalah suatu sistem un...
5,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,Penulis : NURRACHMAT,"Dosen Pembimbing I : Arik Kurniawati, S.Kom., ...","Dosen Pembimbing II :Kurniawan Eka Permana, S....","Perkembangan game yang semakin pesat, memberik..."
6,AUTO LEVELING BERBASIS FINITE STATE MACHINE (F...,Penulis : Syafrial Nur Maulana,"Dosen Pembimbing I : Arik Kurniawati, S.Kom.,M.T.","Dosen Pembimbing II :Ari Kusumaningsih, S.T., ...",Pengetahuan tentang bahasa Inggris pada jaman ...
7,Gerak Pekerja Pada Game Real Time Strategy Men...,Penulis : Adi Chandra Laksono,"Dosen Pembimbing I : Kurniawan Eka P, S.Kom., Msc","Dosen Pembimbing II :Arik Kurniawati, S.Kom., ...",Gerak pekerja ada pada game yang memiliki genr...
8,PENGEMBANGAN MESIN PENCARIAN ANTIPLAGIASI PADA...,Penulis : ilham wibisono aziz,"Dosen Pembimbing I : Hermawan S.T.,M.Kom",Dosen Pembimbing II :Andharini Dwi Cahyani S.K...,ABSTRAK_x000D_\nPenjiplakan merupakan masalah ...
9,ANALISA VALIDITAS PENERIMA BEASISWA MENGGUNAKA...,Penulis : Hilmi Fairuz Abadi,"Dosen Pembimbing I : Bain Khusnul K., S.T., M....","Dosen Pembimbing II :Firli Irhamni, S.T., M.Kom.",Beasiswa adalah pemberian berupa bantuan keuan...


# Case Folding

In [4]:

# gunakan fungsi Series.str.lower() pada Pandas
dataPTA['Abstrak'] = dataPTA['Abstrak'].str.lower()

print('Case Folding Result : \n')

#cek hasil case fold
print(dataPTA['Abstrak'].head(5))
print('\n\n\n')

KeyError: 'Abstrak'

# Tokenization

In [None]:
#Import Library untuk Tokenisasi
import string 
import re #regex library

# import word_tokenize & FreqDist dari NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

def remove_PTA_special(text):
    # menghapus tab, new line, dan back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # menghapus non ASCII (emoticon, chinese word, .etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # menghapus mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # menghapus incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
dataPTA['Abstrak'] = dataPTA['Abstrak'].apply(remove_PTA_special)

#menghapus nomor
def remove_number(text):
    return  re.sub(r"\d+", "", text)

dataPTA['Abstrak'] = dataPTA['Abstrak'].apply(remove_number)

#menghapus punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("","",string.punctuation))

dataPTA['Abstrak'] = dataPTA['Abstrak'].apply(remove_punctuation)

#menghapus spasi leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

dataPTA['Abstrak'] = dataPTA['Abstrak'].apply(remove_whitespace_LT)

#menghapus spasi tunggal dan ganda
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

dataPTA['Abstrak'] = dataPTA['Abstrak'].apply(remove_whitespace_multiple)

# menghapus kata 1 abjad
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

dataPTA['Abstrak'] = dataPTA['Abstrak'].apply(remove_singl_char)

# Tokenisasi
def word_tokenize_wrapper(text):
    return word_tokenize(text)

dataPTA['abstrak_token'] = dataPTA['Abstrak'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(dataPTA['abstrak_token'].head())
print('\n\n\n')

# STOPWORD 

In [None]:
from nltk.corpus import stopwords

list_stopwords = stopwords.words('indonesian')

# Mengubah List ke dictionary
list_stopwords = set(list_stopwords)


#remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

#aStopwording
dataPTA['abstrak_stop'] = dataPTA['abstrak_token'].apply(stopwords_removal) 


print(dataPTA['abstrak_stop'].head(20))

In [None]:
dataPTA.head()

In [None]:
dataPTA.to_excel("Text_Preprocessing.xlsx")

# TF Process

In [None]:
PTA = pd.read_excel("Text_Preprocessing.xlsx", usecols=["abstrak_stem"])
PTA.columns = ["Abstrak"]

PTA.head()

# Creating Dummy

In [None]:
def dummy(doc):
    return doc

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#Membuat vektor object
vectorizer = CountVectorizer()

#Memasukkan semua abstrak ke dalam 1 list
dokumen = []
for term in PTA['Abstrak']:
    document = ''
    for link in term:
        document += link + ''
    dokumen.append(document)

# Vectorizing

In [None]:
vectorizer.fit(dokumen)

# Printing the identified Unique words along with their indices
print("Vocabulary: ", vectorizer.vocabulary_)

# Encode the Document
vector = vectorizer.transform(dokumen)

# Summarizing the Encoded Texts
print("Encoded Document is:")
print(vector.toarray())

# Mengurutkan data dan menyimpan dalam bentuk CSV

In [None]:
count = vector.toarray()
urut = sorted(vectorizer.vocabulary_)
df = pd.DataFrame(data=count,columns = urut)
df.to_csv('Hasil_TF.csv')