# Dataset Cleaning Preparation

This Jupyter Notebook does the following:
1. Imports the unprocessed textbooks in the form txt files
1. Cleans and sentence tokenizes the texts
1. Extracts sentences containing the specified minority keywords
1. Writes the extracted sentences to their respective txt files for other sentiment analysis models to be used

In [1]:
import re
from nltk.text import Text
import numpy as np
import pipeline_caller

caller = pipeline_caller.PipelineCaller()

In [2]:
r_file_names = ['obl/txt/9tarih.txt', 'obl/txt/10tarih.txt', 'obl/txt/11tarih.txt', 'obl/txt/12inkilap.txt']

with  open(r_file_names[0], "r", encoding = "utf-8") as f1:
    raw_text1 = f1.read()

with  open(r_file_names[1], "r", encoding = "utf-8") as f2:
    raw_text2 = f2.read()
    
with  open(r_file_names[2], "r", encoding = "utf-8") as f3:
    raw_text3 = f3.read()
    
with  open(r_file_names[3], "r", encoding = "utf-8") as f4:
    raw_text4 = f4.read()

In [3]:
def initial_cleaner(text):
    cleaned_text = list(map(lambda x: x \
                     .replace("\n", " ") \
                     .replace("\uf0f5", "ı") \
                     .replace("\x0c", ""), text.split()))
    
    return cleaned_text

In [4]:
text9 = " ".join(initial_cleaner(raw_text1))
text10 = " ".join(initial_cleaner(raw_text2))
text11 = " ".join(initial_cleaner(raw_text3))
text12 = " ".join(initial_cleaner(raw_text4))

In [5]:
#Applying ITU NLP Sentence Splitter

text9_tkz = caller.call('sentencesplitter', text9, 'luUiBPs1DC9iAxrWlFLRDDN8atWtGGL7').splitlines()
text10_tkz = caller.call('sentencesplitter', text10, 'luUiBPs1DC9iAxrWlFLRDDN8atWtGGL7').splitlines()
text11_tkz = caller.call('sentencesplitter', text11, 'luUiBPs1DC9iAxrWlFLRDDN8atWtGGL7').splitlines()
text12_tkz = caller.call('sentencesplitter', text12, 'luUiBPs1DC9iAxrWlFLRDDN8atWtGGL7').splitlines()

In [9]:
minority_keywords = ["yunan", "ermeni", "kürt", "alevi", " arap", "boşnak", "çerkes", "çingene", "yahudi", "rum", "gayri", \
                    "hristiyan", "arnavut"]


txt9_minority_sentences = []

for sent in text9_tkz:
    for kw in minority_keywords:
        for word in sent.split():
            if word.lower().startswith(kw):
                txt9_minority_sentences.append(sent)
            else:
                continue
            

txt10_minority_sentences = []

for sent in text10_tkz:
    for kw in minority_keywords:
        for word in sent.split():
            if word.lower().startswith(kw):
                txt10_minority_sentences.append(sent)
            else:
                continue


txt11_minority_sentences = []
for sent in text11_tkz:
    for kw in minority_keywords:
        for word in sent.split():
            if word.lower().startswith(kw):
                txt11_minority_sentences.append(sent)
            else:
                continue

txt12_minority_sentences = []
for sent in text12_tkz:
    for kw in minority_keywords:
        for word in sent.split():
            if word.lower().startswith(kw):
                txt12_minority_sentences.append(sent)
            else:
                continue      

In [10]:
print("9th Grade Textbook Minority Sentence Counts:", len(txt9_minority_sentences), "\n",\
      "10th Grade Textbook Minority Sentence Counts:", len(txt10_minority_sentences), "\n",\
      "11th Grade Textbook Minority Sentence Counts:", len(txt11_minority_sentences), "\n",\
      "12th Grade Textbook Minority Sentence Counts:", len(txt12_minority_sentences))

9th Grade Textbook Minority Sentence Counts: 117 
 10th Grade Textbook Minority Sentence Counts: 206 
 11th Grade Textbook Minority Sentence Counts: 250 
 12th Grade Textbook Minority Sentence Counts: 591


In [11]:
txt12_minority_sentences[:10]

['3 Ermenilerin Faaliyetleri ve ASALA Terör Örgütü...................',
 'Bilgi Notu Bilgi Notu Ali Rıza Efendi’nin ailesi; Osmanlı Devleti’nin Rumeli’yi iskân siyaseti doğrultusunda, Anadolu’dan göç ettirilerek Makedonya’da Manastır ilinin Debre-i Bâlâ Sancağı’na yer- leştirilmiş Kocacık Yörüklerinden- dir.',
 'Zübeyde Hanım’ın ataları, Fatih Sultan Mehmet döneminde, Kon- ya yöresinden Rumeli’ye göç etti- rilmiş ‘Konyarlar’ diye adlandırılan Türkmenlerdendir.',
 'İz Bırakan Şehirler Selanik: Mustafa Kemal’in doğup büyüdüğü ve günümüzde Yunanistan’ın sınırları içerisinde kalan Selanik, Makedonya’nın sosyal, siyasi, ekonomik ve kültürel açıdan en gelişmiş şehriydi.',
 'Nüfusunun çoğunluğu Türk olan Selanik’te; Yunan, Ermeni, Bul- gar, Yahudi gibi çeşitli din, mezhep ve milletlerden insanlar bir arada yaşa- maktaydı.',
 'Nüfusunun çoğunluğu Türk olan Selanik’te; Yunan, Ermeni, Bul- gar, Yahudi gibi çeşitli din, mezhep ve milletlerden insanlar bir arada yaşa- maktaydı.',
 'Nüfusunun çoğun

In [15]:
def line_break_remover(text):
    processed_text = []
    for sent in text:
        words = sent.split()
        for w in words:
            if w.endswith("-"):
                ix = words.index(w)
                combined = (w + words[ix+1]).replace("-", "")
                processed_text.append(combined)
                words.pop(ix+1)
            else:
                processed_text.append(w)
                
    return processed_text

In [16]:
txt9_minority_sentences = " ".join(line_break_remover(txt9_minority_sentences))
txt10_minority_sentences = " ".join(line_break_remover(txt10_minority_sentences))
txt11_minority_sentences = " ".join(line_break_remover(txt11_minority_sentences))
txt12_minority_sentences = " ".join(line_break_remover(txt12_minority_sentences))

In [17]:
txt9_minority_sentences = caller.call('sentencesplitter', txt9_minority_sentences, 'luUiBPs1DC9iAxrWlFLRDDN8atWtGGL7').splitlines()
txt10_minority_sentences = caller.call('sentencesplitter', txt10_minority_sentences, 'luUiBPs1DC9iAxrWlFLRDDN8atWtGGL7').splitlines()
txt11_minority_sentences = caller.call('sentencesplitter', txt11_minority_sentences, 'luUiBPs1DC9iAxrWlFLRDDN8atWtGGL7').splitlines()
txt12_minority_sentences = caller.call('sentencesplitter', txt12_minority_sentences, 'luUiBPs1DC9iAxrWlFLRDDN8atWtGGL7').splitlines()

In [18]:
print("9th Grade Textbook Minority Sentence Counts:", len(txt9_minority_sentences), "\n", \
      "10th Grade Textbook Minority Sentence Counts:", len(txt10_minority_sentences), "\n", \
      "11th Grade Textbook Minority Sentence Counts:", len(txt11_minority_sentences), "\n",\
      "12th Grade Textbook Minority Sentence Counts:", len(txt12_minority_sentences))

9th Grade Textbook Minority Sentence Counts: 1 
 10th Grade Textbook Minority Sentence Counts: 1 
 11th Grade Textbook Minority Sentence Counts: 1 
 12th Grade Textbook Minority Sentence Counts: 1


In [None]:
#Writing minority sentences to txt files to use in other sentiment analysis approaches

with open('9thgrade_sentences.txt', 'w', encoding = 'utf-8') as f1:
    for sent in txt9_minority_sentences:
        f1.write("%s\n" % sent)
        
with open('10thgrade_sentences.txt', 'w', encoding = "utf-8") as f2:
    for sent in txt10_minority_sentences:
        f2.write("%s\n" % sent)
        
with open('11thgrade_sentences.txt', 'w', encoding = "utf-8") as f3:
    for sent in txt11_minority_sentences:
        f3.write("%s\n" % sent)
        
with open('12thgrade_sentences.txt', 'w', encoding = "utf-8") as f4:
    for sent in txt12_minority_sentences:
        f4.write("%s\n" % sent)