In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("NLPCleanData.csv")
print(f"CSV shape: {df.shape}")
print(f"Number of unique authors: {df['Author'].nunique()}")
print(f"Total samples: {len(df)}")

CSV shape: (12, 3)
Number of unique authors: 12
Total samples: 12


In [2]:
df

Unnamed: 0,Author,Gender,Sample
0,Alfred Russel Wallace,Male,"solar heat and light, as entirely unsupported ..."
1,Amy Godine,Female,men fought slavery. Its centrality in their li...
2,Charles Fredrick Holder,Male,"houses, or any object that excited its ire, an..."
3,David Goulson,Male,frame as a contender to explain insect decline...
4,Frederick Courteney Selous,Male,there seems every reason to believe have died ...
5,Greta Thunberg,Female,to that challenge. But I want to ask you all t...
6,Henry David Thoreau,Male,part of the fruit is lost with the bloom which...
7,Jane Goodall,Female,moon. If I heard the coughing grunt of a leopa...
8,Rachel Carson,Female,"almost nonexistent, until such time as careful..."
9,Robin Wall Kimmerer,Female,people. She smiled kindly at him and instructe...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Author  12 non-null     object
 1   Gender  12 non-null     object
 2   Sample  12 non-null     object
dtypes: object(3)
memory usage: 420.0+ bytes


In [4]:
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])  # Adds new column: Male=0, Female=1

In [5]:
# If you have 5,000 words per author, but only 15 rows total,
# then each row must contain ALL 5,000 words for one author
# We need to split each row into multiple samples

import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords

# Download stopwords if needed, “the”, “of”, and “to”
try:
    stop_words = set(stopwords.words('english'))
except:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

df = pd.read_csv("NLPCleanData.csv")

# Label encode thhe gener column
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])  # Now Male=0, Female=1

print("Gender encoding mapping:")
print(f"  {le.classes_[0]} → 0")
print(f"  {le.classes_[1]} → 1")

# Function to remove stopwords
def remove_stopwords(text):
    words = str(text).split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Apply stopword removal
df['Sample'] = df['Sample'].apply(remove_stopwords)

# Function to split text into N equal chunks, no overlap
def split_into_equal_chunks(text, num_chunks=10):
    words = text.split()
    total_words = len(words)
    
    if total_words == 0:
        return []
    
    # Calculate chunk size
    chunk_size = max(1, total_words // num_chunks)  # Ensure at least 1 word in each cheinl 
    
    chunks = []
    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = start_idx + chunk_size if i < num_chunks - 1 else total_words
        
        if start_idx < total_words:
            chunk = ' '.join(words[start_idx:end_idx])
            if chunk.strip():  
                chunks.append(chunk)
    
    return chunks

# Process each row to create more samples
new_rows = []
for idx, row in df.iterrows():
    text = str(row['Sample'])
    label = row['Gender']  # Now numeric (0 or 1)
    author = row['Author']
    
    # Split into 10 equal chunks, no overlap
    chunks = split_into_equal_chunks(text, num_chunks=10)
    
    for chunk_idx, chunk in enumerate(chunks):
        new_rows.append({
            'Author': f"{author}_chunk{chunk_idx}",
            'Gender': label,  # Already numeric 0/1
            'Sample': chunk,
            'Total Words': len(chunk.split()),
        })

# Create new DataFrame
new_df = pd.DataFrame(new_rows)

print(f"\nNew shape after splitting: {new_df.shape}")
print(f"Male (0): {sum(new_df['Gender'] == 0)} samples")
print(f"Female (1): {sum(new_df['Gender'] == 1)} samples")

# Save the expanded dataset
new_df.to_csv("NLPCleanData_Chunky.csv", index=False)
new_df

Gender encoding mapping:
  Female → 0
  Male → 1

New shape after splitting: (120, 4)
Male (0): 60 samples
Female (1): 60 samples


Unnamed: 0,Author,Gender,Sample,Total Words
0,Alfred Russel Wallace_chunk0,1,"solar heat light, entirely unsupported facts. ...",259
1,Alfred Russel Wallace_chunk1,1,"pupa exposed soft, semi-transparent condition,...",259
2,Alfred Russel Wallace_chunk2,1,"peculiar powers change colour adaptation are, ...",259
3,Alfred Russel Wallace_chunk3,1,"Animals,” (_Contributions Theory Natural Selec...",259
4,Alfred Russel Wallace_chunk4,1,"tint pattern; fly slowly, never attempt concea...",259
...,...,...,...,...
115,William T. Hornaday_chunk5,1,herd includes two fine bull calves dropped las...,280
116,William T. Hornaday_chunk6,1,"respecting herd, date November 1, 1888: ""The a...",280
117,William T. Hornaday_chunk7,1,"D. Nowell, North Platte, Nebraska, $100 pair, ...",280
118,William T. Hornaday_chunk8,1,"know precisely were, sad fate buffalo warned t...",280


In [6]:
new_df

Unnamed: 0,Author,Gender,Sample,Total Words
0,Alfred Russel Wallace_chunk0,1,"solar heat light, entirely unsupported facts. ...",259
1,Alfred Russel Wallace_chunk1,1,"pupa exposed soft, semi-transparent condition,...",259
2,Alfred Russel Wallace_chunk2,1,"peculiar powers change colour adaptation are, ...",259
3,Alfred Russel Wallace_chunk3,1,"Animals,” (_Contributions Theory Natural Selec...",259
4,Alfred Russel Wallace_chunk4,1,"tint pattern; fly slowly, never attempt concea...",259
...,...,...,...,...
115,William T. Hornaday_chunk5,1,herd includes two fine bull calves dropped las...,280
116,William T. Hornaday_chunk6,1,"respecting herd, date November 1, 1888: ""The a...",280
117,William T. Hornaday_chunk7,1,"D. Nowell, North Platte, Nebraska, $100 pair, ...",280
118,William T. Hornaday_chunk8,1,"know precisely were, sad fate buffalo warned t...",280


In [21]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GroupShuffleSplit
new_df['Sample'] = (new_df['Sample'].str.replace(r'[^a-zA-Z\s]+', '', regex=True).str.lower())

X=new_df['Sample']
y=new_df['Gender']

X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=.2, random_state=1234)


vectorizer = TfidfVectorizer(
    max_features=35,  # best is
    ngram_range=(1, 1)  )

Xtrain = vectorizer.fit_transform(X_train).toarray()
Xtest = vectorizer.transform(X_test).toarray()
Ytrain = np.array(y_train)
Ytest = np.array(y_test)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
log=LogisticRegression()
model=log.fit(Xtrain,Ytrain)
ypred=model.predict(Xtest)
acc=accuracy_score(Ytest,ypred)
tacc=model.score(Xtrain,Ytrain)

print(f"Accuracy is {acc}")
print(f"Training accuracy is {tacc}")

Accuracy is 0.9166666666666666
Training accuracy is 0.8958333333333334


array([[0.13725147, 0.        , 0.        , 0.        , 0.52809364,
        0.        , 0.19279136, 0.        , 0.16151291, 0.        ,
        0.38558271, 0.        , 0.        , 0.        , 0.        ,
        0.13725147, 0.16151291, 0.        , 0.        , 0.38558271,
        0.17603121, 0.17603121, 0.        , 0.        , 0.        ,
        0.        , 0.48453874],
       [0.12121351, 0.34052702, 0.34052702, 0.        , 0.1554618 ,
        0.1554618 , 0.        , 0.        , 0.28527996, 0.        ,
        0.        , 0.46638541, 0.        , 0.        , 0.17026351,
        0.        , 0.14263998, 0.1554618 , 0.17026351, 0.        ,
        0.1554618 , 0.1554618 , 0.17026351, 0.        , 0.34052702,
        0.17026351, 0.28527996],
       [0.        , 0.        , 0.        , 0.2910455 , 0.13287186,
        0.13287186, 0.14552275, 0.14552275, 0.12191316, 0.14552275,
        0.        , 0.        , 0.        , 0.14552275, 0.72761374,
        0.        , 0.36573948, 0.        , 0.1455