# Step 1: Scrape & Clean

In [3]:
import praw
import pandas as pd
import re

# --- Cleaner Functions ---
def clean_text(text):
    """Strip URLs, @mentions, #hashtags and stray symbols from ``text``.

    The substitutions run in a fixed order: URL-like tokens first, then
    mentions/hashtags, then every character that is not a word character,
    whitespace, or one of ``. , ! ?``. Leading and trailing whitespace is
    trimmed from the result; gaps left in the middle of the string by a
    removed token are not collapsed.
    """
    removal_patterns = (
        r'http\S+|www\S+|https\S+',  # URL-like tokens
        r'@\w+|#\w+',                # mentions and hashtags
        r'[^\w\s.,!?]',              # Keep basic punctuation
    )
    stripped = text
    for pattern in removal_patterns:
        stripped = re.sub(pattern, '', stripped)
    return stripped.strip()

def is_valid_greeklish(text):
    """Return True when ``text`` looks like a Greeklish (Latin-script Greek) sentence.

    A text is accepted only if it passes every filter, in this order:
    it is not a ``[deleted]``/``[removed]`` placeholder, its cleaned form has
    at least 3 characters, it contains no Greek or Cyrillic letters, at most
    20% of its characters are digits, and it contains at least two distinct
    words from a small Greeklish keyword list.

    NOTE(review): a later cell in this notebook defines another ``clean_text``;
    whichever definition was executed last is the one called here.

    Args:
        text: Raw title or comment body.

    Returns:
        bool: True if the text passes all filters, False otherwise.
    """
    lowered = text.lower()

    # Placeholders must be matched on the raw text: clean_text() strips the
    # square brackets, so comparing after cleaning can never match.
    if lowered.strip() in ("[deleted]", "[removed]"):
        return False

    cleaned = clean_text(lowered)
    if len(cleaned) < 3:
        return False

    # Reject Greek/Cyrillic scripts
    if re.search(r'[α-ωά-ώΑ-Ω]', cleaned) or re.search(r'[а-яА-Я]', cleaned):
        return False

    # Reject excessive numbers/leet (e.g., "k4n315")
    digit_count = sum(ch.isdigit() for ch in cleaned)
    if digit_count > len(cleaned) * 0.2:
        return False

    # Key Greeklish words (customize as needed)
    greeklish_words = {
        "kaneis", "einai", "thelw", "gia", "auto", "kai", "den", "ti", "mou",
        "sou", "tora", "minymata", "ellinika", "greeklish", "vlepw", "kserw"
    }
    words = set(re.findall(r'\b\w+\b', cleaned))
    return len(words & greeklish_words) >= 2  # At least 2 Greeklish words

# --- Reddit Scraper ---
def scrape_reddit_greeklish():
    """Collect Greeklish titles and comments from r/greece and save them to CSV.

    Searches the "greece" subreddit for "greeklish", keeps every submission
    title and comment body accepted by ``is_valid_greeklish``, and writes the
    rows (columns ``text``, ``label``, ``source``) to
    ``greeklish_reddit_cleaned.csv``.

    Reddit API credentials are read from the ``REDDIT_CLIENT_ID`` and
    ``REDDIT_CLIENT_SECRET`` environment variables so that secrets are never
    stored in the notebook.

    Raises:
        RuntimeError: If either credential environment variable is unset.
    """
    import os  # local import: this cell's import block does not bring in os

    client_id = os.environ.get("REDDIT_CLIENT_ID")
    client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
    if not client_id or not client_secret:
        raise RuntimeError(
            "Set the REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET environment "
            "variables before scraping."
        )

    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent="GreeklishScraper/0.0.1",
    )

    posts = []
    for submission in reddit.subreddit("greece").search("greeklish", limit=10000):
        if is_valid_greeklish(submission.title):
            posts.append({"text": submission.title, "label": "greeklish", "source": "reddit"})

        # limit=0 discards the "load more comments" placeholders instead of
        # fetching them, so only already-loaded comments are walked.
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            if is_valid_greeklish(comment.body):
                posts.append({"text": comment.body, "label": "greeklish", "source": "reddit"})

    # Explicit columns keep the CSV header intact even when nothing matched,
    # so later pd.read_csv calls do not fail on an empty file.
    df = pd.DataFrame(posts, columns=["text", "label", "source"])
    df.to_csv("greeklish_reddit_cleaned.csv", index=False)
    print(f"Saved {len(df)} valid Greeklish sentences.")

# Run the scrape when this cell/script is executed directly; the guard keeps
# the network scrape from running if the code is imported as a module.
if __name__ == "__main__":
    scrape_reddit_greeklish()

Saved 187 valid Greeklish sentences.


In [4]:
# Sanity check: reload the CSV written by the Reddit scraper and report its row count.
df = pd.read_csv("greeklish_reddit_cleaned.csv")
print(f"Total Greeklish samples: {len(df)}")

Total Greeklish samples: 187


# Scraping English

In [5]:
import requests
import re
import pandas as pd

def scrape_gutenberg(url="https://www.gutenberg.org/files/1342/1342-0.txt",
                     max_sentences=150, timeout=30):
    """Download a Project Gutenberg text and save a sample of its sentences.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace.
    A sentence is kept when it has 9-24 whitespace-separated words, is pure
    ASCII, and does not start with ``CHAPTER``, ``***`` or ``[``. Collection
    stops after ``max_sentences`` sentences; the rows (columns ``text``,
    ``label``, ``source``) are written to ``english_gutenberg.csv``.

    Args:
        url: Plain-text file to download (defaults to Pride and Prejudice).
        max_sentences: Maximum number of sentences to keep (default 150).
        timeout: Request timeout in seconds, so a stalled connection cannot
            hang the notebook.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as prose.
    response.raise_for_status()
    text = response.text

    skip_prefixes = ("CHAPTER", "***", "[")
    sentences = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentences) >= max_sentences:
            break
        # Hard line wraps inside a sentence become single spaces.
        sentence = re.sub(r'[\r\n]+', ' ', sentence).strip()
        if (8 < len(sentence.split()) < 25 and
                sentence.isascii() and
                not sentence.startswith(skip_prefixes)):
            sentences.append({
                "text": sentence,
                "label": "english",
                "source": "gutenberg"
            })

    # Explicit columns keep the CSV header even if no sentence qualified.
    pd.DataFrame(sentences, columns=["text", "label", "source"]).to_csv(
        "english_gutenberg.csv", index=False
    )
    print(f"Collected {len(sentences)} classic English sentences from Gutenberg.")

# Build english_gutenberg.csv (performs a network download each time this cell runs).
scrape_gutenberg()

Collected 150 classic English sentences from Gutenberg.


In [6]:
import requests
import re
import pandas as pd

def scrape_wikipedia(target_count=150, max_pages=500, timeout=30):
    """Sample English sentences from random Wikipedia articles and save them to CSV.

    Repeatedly asks the MediaWiki API for one random page title
    (``rnnamespace=0``), fetches that page's plain-text extract, and keeps
    every ASCII-only sentence of 11-29 whitespace-separated words until
    ``target_count`` sentences are collected. The rows (columns ``text``,
    ``label``, ``source``) are written to ``english_wikipedia.csv``.

    Args:
        target_count: Number of sentences to collect (default 150).
        max_pages: Upper bound on random pages fetched, so the loop ends
            even if pages keep yielding no usable sentences.
        timeout: Per-request timeout in seconds.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    WIKI_URL = "https://en.wikipedia.org/w/api.php"
    # Wikimedia's API etiquette asks clients to identify themselves with a
    # descriptive User-Agent rather than a library default.
    headers = {"User-Agent": "GreeklishDatasetBuilder/0.0.1 (language-ID notebook)"}
    sentences = []
    pages_fetched = 0

    while len(sentences) < target_count and pages_fetched < max_pages:
        pages_fetched += 1

        # Get random Wikipedia page
        params = {
            'action': 'query',
            'format': 'json',
            'list': 'random',
            'rnnamespace': 0,
            'rnlimit': 1
        }
        response = requests.get(WIKI_URL, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        page_title = response.json()['query']['random'][0]['title']

        # Get page content
        params = {
            'action': 'query',
            'format': 'json',
            'titles': page_title,
            'prop': 'extracts',
            'explaintext': True
        }
        response = requests.get(WIKI_URL, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        page = next(iter(response.json()['query']['pages'].values()))
        # Some pages come back without an 'extract' key; treat them as empty
        # instead of aborting the whole scrape with a KeyError.
        text = page.get('extract', '')

        # Extract sentences
        for sentence in re.split(r'(?<=[.!?])\s+', text):
            if 10 < len(sentence.split()) < 30 and sentence.isascii():
                sentences.append({
                    "text": sentence.strip(),
                    "label": "english",
                    "source": "wikipedia"
                })
                if len(sentences) >= target_count:
                    break

    # Explicit columns keep the CSV header even if no sentence qualified.
    pd.DataFrame(sentences, columns=["text", "label", "source"]).to_csv(
        "english_wikipedia.csv", index=False
    )
    print(f"Collected {len(sentences)} English sentences from Wikipedia.")

# Build english_wikipedia.csv; pages are chosen at random, so each run yields a different sample.
scrape_wikipedia()

Collected 150 English sentences from Wikipedia.


In [1]:
import pandas as pd

In [2]:
# Load the three per-source CSVs, stack them into a single frame with a fresh
# 0..n-1 index, and persist the result.
df, df_gutenberg, df_wikipedia = (
    pd.read_csv(csv_name)
    for csv_name in (
        "greeklish_reddit_cleaned.csv",
        "english_gutenberg.csv",
        "english_wikipedia.csv",
    )
)
df_combined = pd.concat([df, df_gutenberg, df_wikipedia], ignore_index=True)
df_combined.to_csv("combined_dataset.csv", index=False)

In [10]:
# Drop the provenance column before modelling. Reassigning (instead of
# inplace=True) with errors="ignore" keeps this cell safe to re-run: a second
# execution no longer raises KeyError once "source" is already gone.
df_combined = df_combined.drop(columns=["source"], errors="ignore")
df_combined.head()

Unnamed: 0,text,label
0,den eixa ellinika sto kinito mou gia xronia ka...,greeklish
1,ir8a gia na dw ta sxolia sta greeklish,greeklish
2,gia na katastrepsw tin ellhnikh koultoura kai ...,greeklish
3,Giati to turning point kata twn greeklish htan...,greeklish
4,Sinithos to Kano Gia nostalgikous logous me si...,greeklish


In [3]:
# Row/column count of the combined frame.
# NOTE(review): the recorded output (487, 3) carries execution count In [3], so it appears
# to predate the "source" column drop (In [10]) — re-run to refresh.
df_combined.shape

(487, 3)

In [13]:
!pip install nltk
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# Download the sentence tokenizer (only needed once)
# nltk.download('punkt_tab')


# Function to split paragraphs into sentences
def split_paragraphs_to_sentences(df, text_column, label_column='label'):
    """Explode each row's text into one row per sentence, carrying the label along.

    Args:
        df: DataFrame holding at least ``text_column`` and ``label_column``.
        text_column: Name of the column with the text to split; each value is
            converted with ``str()`` before being tokenized.
        label_column: Name of the label column copied onto every sentence
            (defaults to ``'label'``).

    Returns:
        pd.DataFrame: Columns ``[text_column, label_column]`` with one row per
        sentence returned by ``sent_tokenize``. The columns are present even
        when ``df`` has no rows, so the result can always be saved and
        reloaded with a header.
    """
    expanded_rows = [
        {text_column: sentence, label_column: row[label_column]}
        for _, row in df.iterrows()
        for sentence in sent_tokenize(str(row[text_column]))
    ]
    return pd.DataFrame(expanded_rows, columns=[text_column, label_column])

# Apply sentence splitting: every sentence inherits the label of the post it came from.
# NOTE(review): validation happened on whole posts, so fragments such as ":D" or
# leet-speak lines end up labelled "greeklish" (see the recorded output below).
df_sentences = split_paragraphs_to_sentences(df_combined, 'text')

# Save the new dataset
df_sentences.to_csv("sentences_dataset.csv", index=False)

print("Saved dataset with sentences as 'sentences_dataset.csv'.")
print(df_sentences.head())  # Show first few rows


Saved dataset with sentences as 'sentences_dataset.csv'.
                                                text      label
0  den eixa ellinika sto kinito mou gia xronia ka...  greeklish
1  8ymamai na einai 2003 kai na vlepw greeklish n...  greeklish
2                      eixan ki afta tin plaka tous.  greeklish
3               episis, 8ym4741 k4n315 70 13375p34k?  greeklish
4                                                 :D  greeklish


In [14]:
# Row/column count after sentence splitting.
df_sentences.shape

(728, 2)

In [15]:
# Preprocessing: reload the sentence-level dataset from disk and preview it.
# NOTE: this rebinds `df`, which earlier cells used for the Reddit-only frame.
df = pd.read_csv('sentences_dataset.csv') 
print("Initial data preview:")
print(df.head())
print(f"Total sentences: {len(df)}")

Initial data preview:
                                                text      label
0  den eixa ellinika sto kinito mou gia xronia ka...  greeklish
1  8ymamai na einai 2003 kai na vlepw greeklish n...  greeklish
2                      eixan ki afta tin plaka tous.  greeklish
3               episis, 8ym4741 k4n315 70 13375p34k?  greeklish
4                                                 :D  greeklish
Total sentences: 728


In [None]:
!pip install scikit-learn
import re
from time import sleep
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC  # Changed from LogisticRegression to SVM
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import files
import joblib
import os

In [19]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

In [20]:
#  Preprocessing function
def clean_text(text):
    """Normalise ``text`` for the classifier.

    Lower-cases the input, deletes every run of digits, removes all
    characters listed in ``string.punctuation``, and trims surrounding
    whitespace.

    NOTE: this shares its name with the scraper-side ``clean_text`` defined
    earlier in the notebook and replaces it once this cell has run.
    """
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    lowered_without_digits = re.sub(r'\d+', '', text.lower())
    return lowered_without_digits.translate(punctuation_stripper).strip()

In [22]:
# Apply cleaning.
# Missing texts are dropped *before* the str cast: astype(str) turns NaN into
# the literal string "nan", after which dropna() can no longer find them.
df = df.dropna(subset=['text'])
df = df.assign(text=df['text'].astype(str).apply(clean_text))
# Rows that cleaning reduced to an empty string (pure digits/punctuation) carry no signal.
df = df[df['text'] != '']
df = df.drop_duplicates(subset=['text'])  # Remove duplicates

In [23]:
# Split data: hold out 30% for testing, stratified on the label, with a fixed
# seed so the split is repeatable.
X, y = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=40,
    stratify=y,
)

In [24]:
# Define TF-IDF + SVM pipeline
char_tfidf = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')  # Character-level TF-IDF (1- to 3-grams)
rbf_svm = SVC(kernel='rbf', probability=True)  # SVM with RBF kernel
pipeline = Pipeline([('tfidf', char_tfidf), ('svm', rbf_svm)])

In [25]:

# Train model: fit both pipeline steps (TF-IDF, then SVM) on the training split only.
pipeline.fit(X_train, y_train)

In [26]:
# Predictions: label every held-out test sentence with the fitted pipeline.
y_pred = pipeline.predict(X_test)

In [28]:
# Evaluation: accuracy over both classes; precision, recall and F1 are
# computed with "greeklish" as the positive class.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, pos_label="greeklish")
    for metric in (precision_score, recall_score, f1_score)
)

In [29]:
# Report the test-set metrics, each rounded to four decimals.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.9953
Precision: 1.0000
Recall: 0.9920
F1-score: 0.9960


In [30]:
# Save model: persist the whole fitted pipeline (vectorizer + SVM) in one file.
joblib.dump(pipeline, "greeklish_svm_model.pkl")
print("SVM model saved as 'greeklish_svm_model.pkl'")

SVM model saved as 'greeklish_svm_model.pkl'


In [31]:
# Load saved model.
# NOTE: joblib.load unpickles the file, so only load model files you created or trust.
model = joblib.load("greeklish_svm_model.pkl")

# Function to classify new text
def predict_text(text):
    """Classify a single string with the loaded model.

    The text is passed through ``clean_text`` and then through the
    module-level ``model``, and the first (only) predicted label is returned.
    """
    labels = model.predict([clean_text(text)])
    return labels[0]

# Test examples: smoke-test the reloaded model on one Greeklish and one English phrase.
print(predict_text("ti kaneis"))  # Expected: greeklish
print(predict_text("Hello, how are you?"))  # Expected: english


greeklish
english
