In [1]:
# Imports and data loading
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

# Data paths
data_dir = r"."  # current notebook folder
train_path = os.path.join(data_dir, 'train.csv')
test_path = os.path.join(data_dir, 'test.csv')
sample_sub_path = os.path.join(data_dir, 'sample_submission.csv')

# Load
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample_sub = pd.read_csv(sample_sub_path)

print('train shape:', train.shape)
print('test shape:', test.shape)
print('sample_sub shape:', sample_sub.shape)

# Quick checks
print('\nLabel distribution (train):')
print(train['is_duplicate'].value_counts(normalize=True))

print('\nNulls in train:')
print(train[['question1','question2']].isnull().sum())

# Simple preprocessing
train['question1'] = train['question1'].fillna('')
train['question2'] = train['question2'].fillna('')
test['question1'] = test['question1'].fillna('')
test['question2'] = test['question2'].fillna('')

# Create combined text for TF-IDF fitting
all_questions = pd.concat([train['question1'], train['question2'], test['question1'], test['question2']]).astype(str)
print('\nTotal unique question strings to fit TF-IDF on:', all_questions.shape[0])

  test = pd.read_csv(test_path)


train shape: (404290, 6)
test shape: (3563475, 3)
sample_sub shape: (2345796, 2)

Label distribution (train):
is_duplicate
0    0.630802
1    0.369198
Name: proportion, dtype: float64

Nulls in train:
question1    1
question2    2
dtype: int64

Total unique question strings to fit TF-IDF on: 7935530


In [None]:
# TF-IDF features and baseline model

tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2), analyzer='word')
# Fit on all question text
tfidf.fit(all_questions)

# Transform question1 and question2
q1_train = tfidf.transform(train['question1'].astype(str))
q2_train = tfidf.transform(train['question2'].astype(str))

# Simple feature: absolute difference and elementwise multiplication
from scipy.sparse import hstack
X_train = hstack([q1_train, q2_train, np.abs(q1_train - q2_train)])

y = train['is_duplicate'].values

# Train/validation split
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42, stratify=y)

model = LogisticRegression(max_iter=1000, solver='sag')
model.fit(X_tr, y_tr)

val_pred = model.predict_proba(X_val)[:,1]
val_loss = log_loss(y_val, val_pred)
print('Validation log loss:', val_loss)

# Prepare test features and predict
q1_test = tfidf.transform(test['question1'].astype(str))
q2_test = tfidf.transform(test['question2'].astype(str))
X_test = hstack([q1_test, q2_test, np.abs(q1_test - q2_test)])

test_pred = model.predict_proba(X_test)[:,1]

# Create submission (original)
submission = pd.DataFrame({'test_id': test['test_id'], 'is_duplicate': test_pred})
submission.to_csv('submission.csv', index=False)
print('Saved submission.csv with', submission.shape[0], 'rows')

# Create submission2 matching the format of sample_submission (preserve columns/order)
submission2 = sample_sub.copy()
# If sample_sub contains an 'is_duplicate' column, overwrite it; otherwise add it.
if 'is_duplicate' in submission2.columns:
    submission2['is_duplicate'] = test_pred
else:
    # If sample doesn't have the prediction column, try to align by index
    # Prefer keeping 'test_id' from sample_sub if present
    if 'test_id' in submission2.columns:
        submission2 = submission2[['test_id']].copy()
        submission2['is_duplicate'] = test_pred
    else:
        # Fallback: construct minimal submission with test ids
        submission2 = pd.DataFrame({'test_id': test['test_id'], 'is_duplicate': test_pred})

submission2.to_csv('submission2.csv', index=False)
print('Saved submission2.csv with', submission2.shape[0], 'rows; columns:', submission2.columns.tolist())

Validation log loss: 0.37693674557821594
Saved submission.csv with 3563475 rows


In [1]:
# New cell: create submission2 in the exact format of sample_submission (or fallback)
# This cell assumes you've already run the training/prediction cell so
# `sample_sub`, `test_pred`, and `test` are available in the notebook namespace.

import pandas as pd

# Try to use sample_sub in memory; if not present, try to read the sample file
if 'sample_sub' not in globals():
    try:
        sample_sub = pd.read_csv(sample_sub_path)
    except Exception:
        sample_sub = None

# Try to obtain predictions vector: prefer test_pred, else read submission.csv
preds = None
if 'test_pred' in globals():
    preds = test_pred
else:
    try:
        sub = pd.read_csv('submission.csv')
        if 'is_duplicate' in sub.columns:
            preds = sub['is_duplicate'].values
        else:
            # assume last column contains predictions
            preds = sub.iloc[:, -1].values
    except Exception:
        preds = None

# Build submission2 following sample_sub column layout
if sample_sub is not None and preds is not None:
    submission2 = sample_sub.copy()
    if 'is_duplicate' in submission2.columns:
        submission2['is_duplicate'] = preds
    else:
        # keep sample_sub columns/order but ensure we add the prediction column last
        submission2['is_duplicate'] = preds

elif preds is not None and 'test' in globals():
    # fallback: create using test DataFrame's test_id
    submission2 = pd.DataFrame({'test_id': test['test_id'], 'is_duplicate': preds})

elif preds is not None:
    # safest fallback: use submission.csv contents if available
    submission2 = pd.DataFrame({'is_duplicate': preds})

else:
    raise RuntimeError('No predictions found (no test_pred and no submission.csv). Run the prediction cell first.')

# Save
submission2.to_csv('submission2.csv', index=False)
print('Saved submission2.csv — rows:', submission2.shape[0], 'columns:', submission2.columns.tolist())

  sub = pd.read_csv('submission.csv')


Saved submission2.csv — rows: 3563475 columns: ['is_duplicate']


In [2]:
# New cell: create submission3 with binary labels (0/1) by thresholding at 0.5
# Assumes predictions are available in `test_pred` or in submission/submission2 files.
import os
import pandas as pd
import numpy as np

# Obtain predictions (probabilities)
preds = None
if 'test_pred' in globals():
    preds = np.asarray(test_pred)
else:
    # try submission.csv then submission2.csv
    for fname in ('submission.csv', 'submission2.csv'):
        if os.path.exists(fname):
            try:
                tmp = pd.read_csv(fname)
                if 'is_duplicate' in tmp.columns:
                    preds = tmp['is_duplicate'].values
                else:
                    preds = tmp.iloc[:, -1].values
                break
            except Exception:
                continue

if preds is None:
    raise RuntimeError('No predictions found. Run the prediction cell or ensure submission.csv/submission2.csv exists.')

# Binarize at threshold 0.5
bin_preds = (np.array(preds) >= 0.5).astype(int)

# Build submission3 following sample_submission's layout when possible
submission3 = None
if 'sample_sub' in globals() and isinstance(sample_sub, pd.DataFrame):
    submission3 = sample_sub.copy()
    # align lengths if needed
    if len(bin_preds) != len(submission3):
        minlen = min(len(bin_preds), len(submission3))
        print(f'Warning: length mismatch (preds={len(bin_preds)}, sample_sub={len(submission3)}). Aligning to min={minlen}.')
        submission3 = submission3.iloc[:minlen].copy()
        submission3['is_duplicate'] = bin_preds[:minlen]
    else:
        submission3['is_duplicate'] = bin_preds
elif os.path.exists('sample_submission.csv'):
    sample = pd.read_csv('sample_submission.csv')
    submission3 = sample.copy()
    if len(bin_preds) != len(submission3):
        minlen = min(len(bin_preds), len(submission3))
        print(f'Warning: length mismatch (preds={len(bin_preds)}, sample={len(submission3)}). Aligning to min={minlen}.')
        submission3 = submission3.iloc[:minlen].copy()
        submission3['is_duplicate'] = bin_preds[:minlen]
    else:
        submission3['is_duplicate'] = bin_preds
elif 'test' in globals() and 'test_id' in test.columns:
    submission3 = pd.DataFrame({'test_id': test['test_id'], 'is_duplicate': bin_preds})
else:
    submission3 = pd.DataFrame({'is_duplicate': bin_preds})

# Save
submission3.to_csv('submission3.csv', index=False)
print('Saved submission3.csv — rows:', submission3.shape[0], 'columns:', submission3.columns.tolist())

  tmp = pd.read_csv(fname)


Saved submission3.csv — rows: 2345796 columns: ['test_id', 'is_duplicate']


## How to run

1. Open this notebook in Jupyter or VS Code and run cells in order.
2. It will read `train.csv`, `test.csv`, and `sample_submission.csv` from the notebook folder.
3. The model is a quick baseline and will create `submission.csv` in the same folder.

Next steps

- Add cross-validation and more engineered features (word overlap, fuzzy matching, embeddings).
- Try neural methods (Siamese BiLSTM, SBERT) for stronger performance.