In [None]:
# 1. Check files in current working dir (Colab: /content)
import os, glob

# Upper bound on how many Drive hits are printed, so a large Drive does not
# flood the cell output.
MAX_DRIVE_RESULTS = 50

files = glob.glob('/content/*')
print("Files at /content (filter .csv):")
for f in sorted(files):
    if f.lower().endswith('.csv'):
        print("  ", f)

# If your files are in a subfolder, list that too.
# The .csv filter is applied BEFORE the cap: capping the raw glob result first
# would drop any CSV that is not among the first entries glob happens to return.
print("\nAlso checking /content/drive if mounted...")
drive_csvs = sorted(
    f for f in glob.glob('/content/drive/**', recursive=True)
    if f.lower().endswith('.csv')
)
for f in drive_csvs[:MAX_DRIVE_RESULTS]:
    print("  ", f)
if len(drive_csvs) > MAX_DRIVE_RESULTS:
    print(f"   ... and {len(drive_csvs) - MAX_DRIVE_RESULTS} more")


Files at /content (filter .csv):
   /content/test.csv
   /content/train.csv
   /content/valid.csv

Also checking /content/drive if mounted...


In [None]:
!pip install -q scikit-learn pandas joblib


In [None]:
import os, joblib, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [None]:
# Input / output locations (Colab's default upload directory)
TRAIN_CSV = "/content/train.csv"
VALID_CSV = "/content/valid.csv"
TEST_CSV  = "/content/test.csv"
OUT_DIR = "/content/outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Fail fast on the first split that is not where we expect it
for csv_path in (TRAIN_CSV, VALID_CSV, TEST_CSV):
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Missing expected file: {csv_path}")

# Load the three splits (train, then valid, then test)
train_df, valid_df, test_df = (
    pd.read_csv(path) for path in (TRAIN_CSV, VALID_CSV, TEST_CSV)
)

# Report the dimensions of each split and the column layout of the training set
print("Shapes:")
for split_name, frame in (("train", train_df), ("valid", valid_df), ("test ", test_df)):
    print(f"  {split_name}:", frame.shape)
print("\nColumns (train):", train_df.columns.tolist())

Shapes:
  train: (70000, 2)
  valid: (10000, 2)
  test : (10000, 2)

Columns (train): ['labels', 'text']


In [None]:
import pandas as pd

# For every split show: shape, column names, the first three rows and the
# number of missing values per column, followed by a separator line.
split_paths = ('/content/train.csv', '/content/valid.csv', '/content/test.csv')
for csv_path in split_paths:
    df = pd.read_csv(csv_path)
    print(csv_path, df.shape)
    print("cols:", list(df.columns))
    display(df.head(3))
    null_counts = df.isna().sum()
    print("nulls:\n", null_counts)
    print('-' * 40)


/content/train.csv (70000, 2)
cols: ['labels', 'text']


Unnamed: 0,labels,text
0,pt,"os chefes de defesa da estónia, letónia, lituâ..."
1,bg,размерът на хоризонталната мрежа може да бъде ...
2,zh,很好，以前从不去评价，不知道浪费了多少积分，现在知道积分可以换钱，就要好好评价了，后来我就把...


nulls:
 labels    0
text      0
dtype: int64
----------------------------------------
/content/valid.csv (10000, 2)
cols: ['labels', 'text']


Unnamed: 0,labels,text
0,nl,"""Ik ken geen druk,"" zei Mr. Feith, de ondersec..."
1,nl,Hier is mijn advies op basis van mijn persoonl...
2,es,"Por el precio, este reloj es inmejorable. Esti..."


nulls:
 labels    0
text      0
dtype: int64
----------------------------------------
/content/test.csv (10000, 2)
cols: ['labels', 'text']


Unnamed: 0,labels,text
0,nl,Een man zingt en speelt gitaar.
1,nl,De technologisch geplaatste Nasdaq Composite I...
2,es,Es muy resistente la parte trasera rígida y lo...


nulls:
 labels    0
text      0
dtype: int64
----------------------------------------


In [None]:
train = pd.read_csv('/content/train.csv')
valid = pd.read_csv('/content/valid.csv')

# Class balance of the training split: 30 most frequent labels + distinct count
label_counts = train['labels'].value_counts()
print(label_counts.head(30))
print("n_classes:", train['labels'].nunique())

# Labels that occur in validation but were never seen during training
unseen_labels = set(valid['labels']).difference(train['labels'])
print("valid contains unseen labels?", unseen_labels)


labels
pt    3500
bg    3500
zh    3500
th    3500
ru    3500
pl    3500
ur    3500
sw    3500
tr    3500
es    3500
ar    3500
it    3500
hi    3500
de    3500
el    3500
nl    3500
fr    3500
vi    3500
en    3500
ja    3500
Name: count, dtype: int64
n_classes: 20
valid contains unseen labels? set()


In [None]:
# A reproducible peek at six random training rows
preview = train[['labels', 'text']].sample(n=6, random_state=42)
print(preview.to_string(index=False))

# Character length of every training text (missing values count as empty strings)
train['text_len'] = train['text'].fillna("").astype(str).str.len()
train_lengths = train['text_len']
print("Avg length (train):", train_lengths.mean())
print("Median length (train):", train_lengths.median())
print("Min, max (train):", train_lengths.min(), train_lengths.max())

# Same length column for the validation split; only the mean is reported
valid['text_len'] = valid['text'].fillna("").astype(str).str.len()
valid_lengths = valid['text_len']
print("Avg length (valid):", valid_lengths.mean())


labels                                                                                                                                                                                                                                                                                    text
    de                                                                                                                                                                                                                                                        Hab für den Preis mehr erwartet.
    ru                                                                                                                                                                                                                                          Письмо в письменном виде убийцы , друзья мои !
    hi                                                                                                                                     

In [None]:
# --- Imports ---
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# --- Read the three splits (paths are relative to the working directory) ---
train_df, test_df, valid_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv", "valid.csv")
)

# --- Sanity output: split sizes and the training column layout ---
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}, Valid shape: {valid_df.shape}")
print("Train columns:", list(train_df.columns))

# --- Separate the raw text (features) from the language codes (targets) ---
X_train, X_test, X_valid = train_df["text"], test_df["text"], valid_df["text"]
y_train, y_test, y_valid = train_df["labels"], test_df["labels"], valid_df["labels"]

# --- Character n-gram TF-IDF, vocabulary learned from the training split only ---
tfidf_params = {
    "analyzer": "char",       # character-level n-grams
    "ngram_range": (1, 5),    # short and long character patterns
    "lowercase": True,
    "sublinear_tf": True,
    "max_features": 100000,   # memory/speed trade-off knob
}
vectorizer = TfidfVectorizer(**tfidf_params)

print("Fitting TF-IDF on training set...")
X_train_vec = vectorizer.fit_transform(X_train)
print("Transforming test and validation sets...")
X_test_vec = vectorizer.transform(X_test)
X_valid_vec = vectorizer.transform(X_valid)

# --- Persist the fitted vectorizer (optional) ---
vectorizer_file = "tfidf_vectorizer_langid.joblib"
joblib.dump(vectorizer, vectorizer_file)
print(f"✅ TF-IDF vectorizer saved as {vectorizer_file}")

# --- Dimensions of the resulting matrices ---
for caption, matrix in (
    ("Train vectors: ", X_train_vec),
    ("Test vectors:  ", X_test_vec),
    ("Valid vectors: ", X_valid_vec),
):
    print(f"{caption}{matrix.shape}")


Train shape: (70000, 2), Test shape: (10000, 2), Valid shape: (10000, 2)
Train columns: ['labels', 'text']
Fitting TF-IDF on training set...
Transforming test and validation sets...
✅ TF-IDF vectorizer saved as tfidf_vectorizer_langid.joblib
Train vectors: (70000, 100000)
Test vectors:  (10000, 100000)
Valid vectors: (10000, 100000)


In [None]:
# TRAIN + EVALUATE + SAVE PIPELINE
import time
import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Inputs expected from the TF-IDF cell:
#   X_train_vec, X_valid_vec, X_test_vec  (sparse matrices)
#   y_train, y_valid, y_test              (labels from the CSVs)

# --- Build classifier ---
# `multi_class` is deliberately not passed: the argument is deprecated in recent
# scikit-learn releases, and with solver="saga" on a problem with more than two
# classes the default already fits a multinomial model, so behaviour is the same.
# `random_state` is fixed because saga is a stochastic solver; without it the
# reported metrics are not reproducible from run to run.
clf = LogisticRegression(
    max_iter=1200,
    solver="saga",            # good for large sparse data
    C=2.0,
    n_jobs=-1,
    class_weight="balanced",  # helps with any imbalances
    random_state=42,
    verbose=1                 # optional: set 0 if you don't want output
)

# The vectorizer was saved separately; a combined pipeline is built and saved
# at the end of this cell.
print("Classifier created:", clf)

# --- Train ---
t0 = time.time()
clf.fit(X_train_vec, y_train)
t_train = time.time() - t0
print(f"\nTraining finished in {t_train:.1f} seconds.\n")


def _evaluate_split(split_name, X, y_true):
    """Print accuracy and the classification report for one split.

    Args:
        split_name: Human-readable split name used in the printed headings.
        X: Vectorized features of the split.
        y_true: Ground-truth labels of the split.

    Returns:
        Tuple ``(y_pred, accuracy)`` with the predictions of ``clf`` and
        their accuracy against ``y_true``.
    """
    y_pred = clf.predict(X)
    acc = accuracy_score(y_true, y_pred)
    print(f"{split_name} Accuracy: {acc:.4f}\n")
    print(f"{split_name} classification report:")
    print(classification_report(y_true, y_pred, digits=4))
    return y_pred, acc


# --- Evaluate on validation set ---
y_val_pred, val_acc = _evaluate_split("Validation", X_valid_vec, y_valid)

# --- Evaluate on test set ---
y_test_pred, test_acc = _evaluate_split("Test", X_test_vec, y_test)

# --- (Optional) Confusion matrix on a sampled subset to visualize ---
# If plotting is desired uncomment below:
# import matplotlib.pyplot as plt, seaborn as sns
# labels = sorted(clf.classes_)
# cm = confusion_matrix(y_test, y_test_pred, labels=labels)
# cm_norm = cm.astype('float') / (cm.sum(axis=1)[:, None] + 1e-12)
# plt.figure(figsize=(12,10))
# sns.heatmap(cm_norm, xticklabels=labels, yticklabels=labels, annot=False, cmap='Blues')
# plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title('Normalized confusion matrix');
# plt.show()

# --- Save the pipeline (vectorizer + classifier) for deployment ---
# Reload the fitted vectorizer written by the TF-IDF cell and chain it with the
# trained classifier so a single artifact accepts raw text.
vectorizer = joblib.load("tfidf_vectorizer_langid.joblib")  # we saved this earlier
pipeline = make_pipeline(vectorizer, clf)

model_path = "lang_detector_pipeline_v2_papluca_all20.joblib"
joblib.dump(pipeline, model_path, compress=3)
print(f"\nSaved pipeline to: {model_path} (includes vectorizer + classifier)")

# --- Quick inference examples with confidence and unknown threshold ---
def predict_with_confidence(texts, model=pipeline, threshold=0.35):
    """Predict a language per text and flag low-confidence results as "unknown".

    Args:
        texts: Iterable of raw text strings. A single string is treated as
            one document rather than being iterated character by character.
        model: Fitted estimator exposing ``predict_proba`` and ``classes_``
            that accepts raw text (defaults to the saved pipeline).
        threshold: Minimum top-class probability required to report the
            predicted class; below it the reported label is ``"unknown"``.

    Returns:
        List of ``(text, label, top_prediction, confidence)`` tuples in input
        order, where ``label`` is ``top_prediction`` or ``"unknown"`` and
        ``confidence`` is the top-class probability as a float. An empty
        input yields an empty list.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Materialize once so generators can be both scored and echoed back.
    texts = list(texts)
    if not texts:
        # Nothing to score; avoid handing the estimator a zero-row batch.
        return []

    # One forward pass is enough: the top class is read off the probabilities,
    # so a separate model.predict() call would only repeat the same work.
    probs = model.predict_proba(texts)
    classes = model.classes_

    results = []
    for txt, prob in zip(texts, probs):
        idx = prob.argmax()
        pred = classes[idx]
        conf = float(prob[idx])
        label = pred if conf >= threshold else "unknown"
        results.append((txt, label, pred, conf))
    return results

# Smoke-test sentences in several of the supported languages
examples = [
    "This is a test sentence with numbers 2024 and url https://example.com",
    "Bonjour, je m'appelle Clara et j'aime le NLP.",
    "El producto cuesta 299 dólares.",
    "यह परीक्षण वाक्य है जिसमें संख्याएँ हैं 12345।",
    "这是一个包含数字2024的示例。",
    "Привет, как дела?",
]

# Score all examples in one call, then print one line per sentence
print("\nExample predictions (threshold=0.35):")
example_results = predict_with_confidence(examples, threshold=0.35)
for txt, label, pred, conf in example_results:
    snippet = txt[:80]
    print(f"- {snippet}... -> predicted: {label} (top: {pred}, conf={conf:.3f})")


Classifier created: LogisticRegression(C=2.0, class_weight='balanced', max_iter=1200,
                   multi_class='multinomial', n_jobs=-1, solver='saga',
                   verbose=1)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


convergence after 22 epochs took 52 seconds

Training finished in 52.6 seconds.

Validation Accuracy: 0.9948

Validation classification report:
              precision    recall  f1-score   support

          ar     1.0000    0.9940    0.9970       500
          bg     0.9980    1.0000    0.9990       500
          de     1.0000    1.0000    1.0000       500
          el     1.0000    1.0000    1.0000       500
          en     0.9862    0.9980    0.9920       500
          es     1.0000    0.9940    0.9970       500
          fr     0.9980    1.0000    0.9990       500
          hi     1.0000    0.9520    0.9754       500
          it     0.9960    0.9980    0.9970       500
          ja     1.0000    0.9980    0.9990       500
          nl     0.9843    1.0000    0.9921       500
          pl     1.0000    0.9980    0.9990       500
          pt     0.9980    0.9980    0.9980       500
          ru     1.0000    0.9980    0.9990       500
          sw     0.9452    1.0000    0.9718  