In [7]:
from TurkishStemmer import TurkishStemmer
from simplemma import text_lemmatizer
import json
import re


In [8]:
# Stop words: one entry per line of stopwords.txt, held as a set for fast
# membership checks.
stop_words = None
with open("stopwords.txt", "r") as stop_handle:
    stop_lines = stop_handle.read().splitlines()
    stop_words = {entry for entry in stop_lines}

# Skip words: one entry per line of skips.txt, loaded the same way.
skip_words = None
with open("skips.txt", "r") as skip_handle:
    skip_lines = skip_handle.read().splitlines()
    skip_words = {entry for entry in skip_lines}


In [9]:
def clean_tweet(text):
    """Normalize a raw tweet into lowercase, space-separated words.

    Processing order:
      1. Remove @mentions (ASCII letters, digits and underscores).
      2. Fold special spellings: "Kur’an" -> "kuran", "â" -> "a", "î" -> "i",
         and the Turkish capitals "İ" -> "i", "I" -> "ı".
      3. Turn non-breaking spaces and "|" into plain spaces.
      4. Squeeze any run of a repeated character down to two.
      5. Remove URLs, newlines and hashtags.
      6. Replace everything that is not an ASCII or Turkish letter by a space.
      7. Lowercase and collapse whitespace.

    Args:
        text: Raw tweet text (str).

    Returns:
        The cleaned text; an empty string when no letters survive.
    """
    # Mentions are stripped first, on the untouched text: the "I" -> "ı"
    # replacement below would otherwise put a non-ASCII letter inside handles
    # such as "@ISPARTA" and stop the pattern from matching. The underscore is
    # part of the class so "@ali_veli" does not leave "veli" behind.
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)

    text = text.replace("Kur’an", "kuran")
    text = text.replace("â", "a")
    text = text.replace("î", "i")
    # Handle the Turkish dotted/dotless capitals explicitly; str.lower() maps
    # "I" to "i", which is the wrong letter for Turkish text.
    text = text.replace("İ", "i")
    text = text.replace("I", "ı")
    text = text.replace(u"\u00A0", " ")
    text = text.replace("|", " ")

    # Runs of the same character are cut to two ("çoook" -> "çook").
    text = re.sub(r"(.)\1+", r"\1\1", text)
    # One pattern for "https://", "http://" and the truncated "htt://" prefix.
    text = re.sub(r"htt(?:ps?)?://\S+", " ", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"#(\w+)", " ", text)
    # Whitelist of letters to keep. A blanket "strip all non-ASCII" pass must
    # not be added before this line: it would delete the Turkish letters.
    text = re.sub(r"[^A-Za-zâîığüşöçİĞÜŞÖÇ]+", " ", text)

    return " ".join(text.lower().strip().split())


In [10]:
# Maps each top-level key of the merged file to its list of cleaned tweets.
clean_tweets = {}

with open(f"data/tweets_merged.json", "r") as merged_file:
    all_tweets = json.loads(merged_file.read())

for aday, tweets in all_tweets.items():
    # Clean the "rawContent" of every tweet and keep only non-empty results.
    cleaned_texts = (clean_tweet(item["rawContent"]) for item in tweets)
    clean_tweets[aday] = [cleaned for cleaned in cleaned_texts if cleaned]


In [11]:
# Save the cleaned tweets as indented JSON; ensure_ascii=False writes
# non-ASCII letters as themselves instead of \u escapes.
with open(f"data/clean_tweets.json", "w") as clean_file:
    json.dump(
        clean_tweets,
        clean_file,
        ensure_ascii=False,
        indent=4,
    )


In [12]:
# Rebind all_tweets to the cleaned tweets read back from disk; the cells that
# follow iterate over this mapping.
all_tweets = None

with open(f"data/clean_tweets.json", "r") as clean_file:
    all_tweets = json.loads(clean_file.read())


In [13]:
# Maps each key of all_tweets to its lemmatized tweets (stop words removed).
tweet_lemms = {}

for aday, tweets in all_tweets.items():
    lemmatized = []
    for tweet in tweets:
        # Lemmatize the tweet and discard every lemma found in stop_words.
        kept_lemmas = [
            lemma
            for lemma in text_lemmatizer(tweet, lang="tr")
            if lemma not in stop_words
        ]
        # Tweets left with no lemmas are dropped entirely.
        if kept_lemmas:
            lemmatized.append(" ".join(kept_lemmas))
    tweet_lemms[aday] = lemmatized

with open(f"data/lemm_tweets.json", "w") as lemm_file:
    json.dump(tweet_lemms, lemm_file, ensure_ascii=False, indent=4)

In [14]:
# Maps each key of all_tweets to its stemmed tweets (stop words removed).
tweet_stems = {}
stemmer = TurkishStemmer()

for aday, tweets in all_tweets.items():
    stemmed = []
    for tweet in tweets:
        # Stop words are dropped. Words of six characters or fewer, and words
        # listed in skip_words, are kept as they are; all others are stemmed.
        kept_stems = [
            word if (len(word) <= 6 or word in skip_words) else stemmer.stem(word)
            for word in tweet.split()
            if word not in stop_words
        ]
        # Tweets left with no words are dropped entirely.
        if kept_stems:
            stemmed.append(" ".join(kept_stems))
    tweet_stems[aday] = stemmed


with open(f"data/stem_tweets.json", "w") as stem_file:
    json.dump(tweet_stems, stem_file, ensure_ascii=False, indent=4)
