<a href="https://colab.research.google.com/github/liron7722/AI-Generated-Text-Detector/blob/Production/tdIdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import pandas as pd
from argparse import ArgumentParser

# Default parameters. "maxFeatures" is included (None is scikit-learn's
# "no limit" setting) so the script also works when no flags are supplied.
# NOTE: the "tdidf" spelling is kept as-is: it is the accepted --feature
# value and it also determines the output file name.
params = {
    "feature": "tdidf",
    "minGrams": 1,
    "maxGrams": 2,
    "maxFeatures": None,
}


def resolve_params(argv, defaults):
    """Return the run parameters: *defaults* overridden by flags in *argv*.

    Args:
        argv: Full argument vector with the program name first
            (e.g. ``sys.argv``).
        defaults: Mapping providing the "feature", "minGrams", "maxGrams"
            and "maxFeatures" values used for any flag that is not given.

    Returns:
        A new dict holding the merged parameters; *defaults* is not mutated.
    """
    parser = ArgumentParser()
    parser.add_argument("--feature", default=defaults["feature"],
                        help="Feature type (tdidf or bow)")
    parser.add_argument("--minGrams", type=int, default=defaults["minGrams"],
                        help="Minimum n-gram size")
    parser.add_argument("--maxGrams", type=int, default=defaults["maxGrams"],
                        help="Maximum n-gram size")
    parser.add_argument("--maxFeatures", type=int, default=defaults["maxFeatures"],
                        help="Maximum feature size")
    # parse_known_args instead of parse_args: when this runs as a notebook
    # cell, sys.argv typically holds the kernel's own arguments, and a strict
    # parse would exit on them instead of falling back to the defaults.
    known_args, _ignored = parser.parse_known_args(argv[1:])
    return {**defaults, **vars(known_args)}


def build_vectorizer(feature, min_grams, max_grams, max_features):
    """Create the (unfitted) vectorizer selected by *feature*.

    Args:
        feature: "tdidf" for TfidfVectorizer or "bow" for CountVectorizer.
        min_grams: Lower bound of the n-gram range.
        max_grams: Upper bound of the n-gram range.
        max_features: Value passed as ``max_features`` (None for no limit).

    Returns:
        A new TfidfVectorizer or CountVectorizer instance.

    Raises:
        ValueError: If *feature* is neither "tdidf" nor "bow".
    """
    # scikit-learn is imported lazily, and only the class that is needed.
    if feature == "tdidf":
        from sklearn.feature_extraction.text import TfidfVectorizer as vectorizer_cls
    elif feature == "bow":
        from sklearn.feature_extraction.text import CountVectorizer as vectorizer_cls
    else:
        raise ValueError(
            f"Unsupported feature {feature!r}; expected 'tdidf' or 'bow'"
        )
    return vectorizer_cls(max_features=max_features, ngram_range=(min_grams, max_grams))


# The guard is true both for `python script.py` and for a notebook cell, so
# the pipeline still runs there; it only keeps an import from triggering it.
if __name__ == "__main__":
    # Merge command-line overrides (if any) into the defaults
    params.update(resolve_params(sys.argv, params))

    # Use the parameters
    feature = params["feature"]
    min_grams = params["minGrams"]
    max_grams = params["maxGrams"]
    max_features = params["maxFeatures"]

    # Example: Print parameters to confirm
    print(f"Feature: {feature}, MinGrams: {min_grams}, MaxGrams: {max_grams}")

    # Build the vectorizer first so an unsupported feature fails before any
    # data is loaded.
    vectorizer = build_vectorizer(feature, min_grams, max_grams, max_features)

    # Load the data
    data = pd.read_csv("data.csv")

    # Apply TF-IDF or BOW logic based on the `feature` parameter
    transformed_data = vectorizer.fit_transform(data['text'])

    # Save the output. toarray() materializes the full dense matrix in memory
    # before it is written out.
    output_file = f"{feature}.csv"
    pd.DataFrame(
        transformed_data.toarray(),
        columns=vectorizer.get_feature_names_out(),
    ).to_csv(output_file, index=False)