# Imports

In [1]:
# install required libraries
!pip install -r requirements.txt



In [2]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# NLTK resources used further down: the punkt tokenizer models back
# word_tokenize(), and the 'stopwords' corpus backs stopwords.words('english').
# Without the 'stopwords' download a fresh environment raises LookupError
# the first time the stop-word list is requested.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jortg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jortg/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Loading the data

In [3]:
# Folder holding the three CSV splits; the relative path is resolved against
# the current working directory.
data_folder_name = 'A4data_2526_pan2020'
DATA_FOLDER = Path(data_folder_name)

# Each split stays None unless its CSV is read successfully below.
DEV_SET = TEST_SET = TRAIN_SET = None

if DATA_FOLDER.exists():
    try:
        DEV_SET = pd.read_csv(DATA_FOLDER.joinpath('pan2526_dev_data.csv'))
        TEST_SET = pd.read_csv(DATA_FOLDER.joinpath('pan2526_test_data.csv'))
        TRAIN_SET = pd.read_csv(DATA_FOLDER.joinpath('pan2526_train_data.csv'))
    except FileNotFoundError:
        # Reported instead of raised: splits read before the missing file keep
        # their data, the remaining ones stay None.
        print(f"ERROR: Data not found in '/{DATA_FOLDER}'")
    else:
        print('Data loaded successfully')
else:
    print(f"ERROR: Data folder not found, make sure the '/{data_folder_name}' folder is located in the same folder as this notebook!")
    

Data loaded successfully


Let's have a look at the DEV data:

In [4]:
# Display the DEV frame as this cell's output.
DEV_SET

Unnamed: 0.1,Unnamed: 0,text,author
0,3329,The last part of the pictorial will now begin!...,560480
1,1225,"""It is a huge body of water."" It was slow yet ...",560480
2,3334,"As she reached for the door, the sound of a ca...",512464
3,3573,Despite the scowl that seemed to be forever pl...,2750536
4,1429,The traditional institutions did not take the ...,1112924
...,...,...,...
263,1432,"""Still thinking about it, Kuwabara-jiisan,"" Hi...",1112924
264,1838,"""Well, if it isn""t tha"" police kitteh."" Seras ...",1220273
265,3328,"""I think I have seen that clothes before..."" L...",560480
266,470,"""I almost didn""t turn away from you, even when...",1220273


# Extract features

In [5]:
def _mean_word_length(snippet):
    """Return the mean length of the whitespace-separated words in `snippet`.

    Returns 0 when the snippet contains no words.
    """
    words = snippet.split()
    if not words:
        return 0
    return sum(len(word) for word in words) / len(words)


def get_char_features(data_frame):
    """Add character-level features derived from the 'text' column.

    The feature columns are written onto `data_frame` itself (the input is
    modified in place) and that same object is returned.

    Columns added:
        snippet_length   number of characters in the snippet (raw count)
        comma, period, exclam, question
                         occurrences of the punctuation mark divided by
                         snippet_length
        upper_case, lower_case
                         occurrences of ASCII A-Z / a-z divided by
                         snippet_length
        vowel_frequency  occurrences of lowercase a, e, i, o, u divided by
                         snippet_length (uppercase vowels are not counted)
        avg_word_len     mean length of the whitespace-separated words

    Empty snippets get 0 for every ratio feature (instead of the NaN that
    0 / 0 would produce), matching avg_word_len, which is 0 for text without
    words.

    Parameters:
        data_frame: pandas DataFrame with a 'text' column of strings.

    Returns:
        (data_frame, feature_names): the mutated input frame and the list of
        the feature column names, in the order they were added.
    """
    text = data_frame['text']
    snippet_length = text.str.len()
    data_frame['snippet_length'] = snippet_length

    # Rows whose snippet is the empty string; dividing by their length gives NaN.
    is_empty = snippet_length == 0

    # Feature column -> regex whose matches are counted in every snippet.
    char_patterns = {
        # some char count features
        'comma': r',',
        'period': r'\.',
        'exclam': r'\!',
        'question': r'\?',
        'upper_case': r'[A-Z]',
        'lower_case': r'[a-z]',
        # some aggregated char count features
        'vowel_frequency': r'[aeiou]',
    }
    for column, pattern in char_patterns.items():
        ratio = text.str.count(pattern) / snippet_length
        data_frame[column] = ratio.mask(is_empty, 0.0)

    data_frame['avg_word_len'] = text.apply(_mean_word_length)

    feature_names = ['snippet_length', *char_patterns, 'avg_word_len']

    return data_frame, feature_names

In [6]:
def get_word_features(data_frame):
    """Add word-level features derived from the 'text' column.

    The feature columns are written onto `data_frame` itself (the input is
    modified in place) and that same object is returned.

    Columns added:
        word_count          number of whitespace-separated words
        stopword_frequency  number of NLTK tokens of the lowercased text that
                            are English stop words, divided by word_count

    The numerator is counted over word_tokenize() tokens while the denominator
    counts whitespace-separated words; the two tokenisations differ, so the
    value is a ratio that is not guaranteed to lie in [0, 1].

    Snippets without any words get a stopword_frequency of 0 (instead of the
    NaN that 0 / 0 would produce).

    Parameters:
        data_frame: pandas DataFrame with a 'text' column of strings.

    Returns:
        (data_frame, feature_names): the mutated input frame and the list of
        the feature column names, in the order they were added.
    """
    text = data_frame['text']

    # count number of words
    word_count = text.apply(lambda snippet: len(snippet.split()))
    data_frame['word_count'] = word_count

    # count the stopwords; a set makes the per-token membership test cheap
    stop_words = set(stopwords.words('english'))
    stopword_count = text.apply(
        lambda snippet: sum(
            1 for token in word_tokenize(snippet.lower()) if token in stop_words
        )
    )

    # Empty or whitespace-only snippets have word_count == 0; report 0 for them
    # rather than letting 0 / 0 put NaN into the feature matrix.
    data_frame['stopword_frequency'] = (stopword_count / word_count).mask(
        word_count == 0, 0.0
    )

    feature_names = [
        'word_count',
        'stopword_frequency'
    ]

    return data_frame, feature_names

In [7]:
def extract_features(data_frame):
    """Run every feature extractor on `data_frame` and collect the column names.

    The extractors are applied in order: character-based features first, then
    word / stop-word based features. Each one receives the frame returned by
    the previous one.

    Returns:
        (data_frame, feature_names): the frame returned by the last extractor
        and the concatenated list of feature column names.
    """
    collected_names = []

    for extractor in (get_char_features, get_word_features):
        data_frame, names = extractor(data_frame)
        collected_names.extend(names)

    return data_frame, collected_names

In [8]:
def train_decision_tree(data_frame, test_size=0.2, random_state=42):
    """Fit a decision tree on features extracted from `data_frame` and score it.

    Features are computed with extract_features(), the rows are split into a
    training part and a held-out part, a DecisionTreeClassifier is fitted on
    the training part, and the held-out predictions and accuracy are printed.

    Parameters:
        data_frame: pandas DataFrame with a 'text' column (input to the
            feature extractors) and an 'author' column (the label). The
            feature extractors add their columns to this frame in place.
        test_size: fraction of rows held out for evaluation, passed to
            train_test_split (default 0.2).
        random_state: seed used for both the split and the classifier so that
            repeated runs give the same result (default 42).

    Returns:
        The fitted DecisionTreeClassifier, so it can be reused on other data.
        When the call is the last expression of a notebook cell the estimator
        is displayed; end the call with `;` to suppress that.
    """
    df_with_features, df_feature_names = extract_features(data_frame)

    X = df_with_features[df_feature_names]
    y = df_with_features['author']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)
    print("Predictions:", predictions)

    # Check model accuracy
    print("Accuracy:", model.score(X_test, y_test))

    return model

In [9]:
# Train and score the decision tree on a train/test split of the DEV data.
train_decision_tree(DEV_SET)

Predictions: [1497577  870118  512464 1497577  560480 1497577 2855986   29783 1112924
 1112924  806976  583994 1497577  748687 2943978 6234395 6234395  806976
 1112924 1276465 3439302 1497577 1497577  512464  583064 1220273   29783
 6234395 2855986 2750536 2943978  910821  512464  748687  560480 2943978
 1276465 2855986 3439302  870118  806976 2750536  910821  583994 1220273
 1112924  967934 2750536  512464  910821 2855986 2750536 6234395 1276465]
Accuracy: 0.25925925925925924
