In [14]:
from common import *
import pandas as pd
from scipy.sparse import save_npz, load_npz
from src.representations import DoubleTfIdfVectorizer, SpacyNEClassifier
import os


In [9]:
def load_raw(data: Dataset, name, nrows=20):
    """Load the first rows of a raw CSV file belonging to a dataset.

    The file is read from ``<DATA_DIR>/<data.value>/raw/<name>.csv``.

    Parameters
    ----------
    data : Dataset
        Dataset enum member; its ``value`` is used as the directory name.
    name : str
        File name without the ``.csv`` extension (e.g. ``"train"``).
    nrows : int or None, default 20
        Number of rows to read. ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The parsed rows.
    """
    # os.path.join instead of hard-coded "\\" so the path also resolves
    # on non-Windows systems.
    path = os.path.join(DATA_DIR, data.value, "raw", name + ".csv")
    return pd.read_csv(path, nrows=nrows)

In [10]:
# Load the BBC "train" sample, then fit the double TF-IDF vectorizer on its
# 'text' column. The SpacyNEClassifier instance is handed over via `ner_clf`.
df = load_raw(Dataset.BBC, "train")

ner = SpacyNEClassifier()
double_vect = DoubleTfIdfVectorizer(ner_clf=ner, min_df=1, max_df=20)

# `res` is later passed to scipy's save_npz, i.e. it is a sparse matrix.
res = double_vect.fit_transform(df['text'])

In [11]:
# Scratch check: write the fitted sparse matrix to a single .npz file.
save_npz(file=DATA_DIR + "npz-try.npz", matrix=res)


In [12]:
# Scratch check: store the vectorizer's feature names as a one-column CSV.
columns_df = pd.DataFrame(data={'col': double_vect.get_feature_names()})
columns_df.to_csv(path_or_buf=DATA_DIR + "npz-column.csv", index=False)

In [17]:
def _state_dir(dataset: Dataset, state: State):
    """Return the directory ``<DATA_DIR>/<dataset.value>/<state.value>``."""
    # os.path.join instead of hard-coded "\\" keeps the layout portable.
    return os.path.join(DATA_DIR, dataset.value, state.value)


def _read_columns(path):
    """Read a ``columns.csv`` file, keeping every feature name as a literal string."""
    # dtype=str + keep_default_na=False: tokens such as "null", "nan" or "123"
    # must come back unchanged instead of being parsed as NaN or numbers.
    return pd.read_csv(path, dtype=str, keep_default_na=False)


def save_as_npz (dataset: Dataset, state: State, name, vectorizer, df_raw: pd.DataFrame, sp_array):
    """Persist a vectorized split: sparse matrix, targets and column names.

    Writes into ``<DATA_DIR>/<dataset.value>/<state.value>/``:

    * ``array-<name>.npz``  -- ``sp_array`` saved with ``scipy.sparse.save_npz``
    * ``target-<name>.csv`` -- the ``"target"`` column of ``df_raw``
    * ``columns.csv``       -- written or validated by ``validate_or_save_columns``

    Parameters
    ----------
    dataset : Dataset
    state : State
    name : str
        Identifier of the split, used in the file names.
    vectorizer
        Object exposing ``get_feature_names()``.
    df_raw : pandas.DataFrame
        Must contain a ``"target"`` column.
    sp_array
        Sparse matrix accepted by ``scipy.sparse.save_npz``.

    Raises
    ------
    KeyError
        If ``df_raw`` has no ``"target"`` column.
    AssertionError
        If an existing ``columns.csv`` does not match the vectorizer.
    """
    out_dir = _state_dir(dataset, state)
    # Create the target directory on first use instead of failing.
    os.makedirs(out_dir, exist_ok=True)

    save_npz(os.path.join(out_dir, "array-" + name + ".npz"), sp_array)
    # header=True is explicit because read_as_dataframe looks the column up by
    # name and the Series.to_csv default has differed between pandas versions.
    df_raw["target"].to_csv(
        os.path.join(out_dir, "target-" + name + ".csv"), index=False, header=True
    )
    validate_or_save_columns(dataset, state, vectorizer)

def validate_or_save_columns (dataset: Dataset, state: State, vectorizer):
    """Write ``columns.csv`` for a dataset/state, or check it against the vectorizer.

    If the file does not exist yet, the vectorizer's feature names are saved
    under the header ``columns``. If it exists, both the number and the
    content (including order) of the stored names must match the vectorizer;
    a shape-only comparison would let a different vocabulary of the same size
    pass and silently mislabel columns on reload.

    Raises
    ------
    AssertionError
        If the stored columns differ from ``vectorizer.get_feature_names()``.
    """
    path = os.path.join(_state_dir(dataset, state), "columns.csv")
    col_df = pd.DataFrame({"columns": vectorizer.get_feature_names()})
    if os.path.exists(path):
        stored = _read_columns(path)
        assert stored.shape == col_df.shape, (
            "columns.csv has shape %s but the vectorizer yields %s"
            % (stored.shape, col_df.shape)
        )
        expected = [str(col) for col in col_df["columns"]]
        assert stored["columns"].tolist() == expected, (
            "feature names stored in columns.csv differ from the vectorizer's"
        )
    else:
        col_df.to_csv(path, index = False)

def read_as_dataframe(dataset: Dataset, state: State, name):
    """Rebuild a dense DataFrame from the files written by ``save_as_npz``.

    Parameters
    ----------
    dataset : Dataset
    state : State
    name : str
        Split identifier used when the files were saved.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``columns.csv`` (densified sparse matrix) plus
        a ``TARGET`` column taken from ``target-<name>.csv``.

    Raises
    ------
    AssertionError
        If the matrix shape disagrees with the number of columns or targets.
    """
    data_dir = _state_dir(dataset, state)
    sp_array = load_npz(os.path.join(data_dir, "array-" + name + ".npz"))
    cols = _read_columns(os.path.join(data_dir, "columns.csv"))
    target = pd.read_csv(os.path.join(data_dir, "target-" + name + ".csv"))

    # sanity check
    assert len(cols) == sp_array.shape[1], (
        "columns.csv lists %d names but the matrix has %d columns"
        % (len(cols), sp_array.shape[1])
    )
    assert len(target) == sp_array.shape[0], (
        "target file has %d rows but the matrix has %d rows"
        % (len(target), sp_array.shape[0])
    )
    df = pd.DataFrame(sp_array.toarray(), columns=cols['columns'])
    df['TARGET'] = target['target']
    return df


In [18]:
# Round-trip check: persist the fitted matrix and targets, then display the
# frame rebuilt from disk.
# NOTE(review): `df`/`res` come from Dataset.BBC above but are stored under
# AG_NEWS here -- confirm this is intentional.
save_as_npz(
    dataset=Dataset.AG_NEWS,
    state=State.BIO,
    name="test-save-npz",
    vectorizer=double_vect,
    df_raw=df,
    sp_array=res,
)
read_as_dataframe(dataset=Dataset.AG_NEWS, state=State.BIO, name="test-save-npz")

In [20]:
# Reload the persisted split from disk and print its first five rows as text.
df2 = read_as_dataframe(dataset=Dataset.AG_NEWS, state=State.BIO, name="test-save-npz")

print(df2.head(n=5))

columns,offer,video,demand,service,programme,tv,sky_NE,telewest_NE,pvr_NE,cable,...,dispute,radio_NE,spectator,butt,bigley_NE,edit,yes,angry,semitic_NE,TARGET
0,-0.069626,-0.078256,-0.071983,-0.052171,-0.057441,-0.069316,-0.067801,-0.067801,-0.067801,-0.067501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tech
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,politics
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,sport
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tech
4,-0.006573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,entertainment


In [22]:
# Show the column index of the reloaded frame (feature names plus 'TARGET').
df2.columns

Index(['offer', 'video', 'demand', 'service', 'programme', 'tv', 'sky_NE',
       'telewest_NE', 'pvr_NE', 'cable',
       ...
       'dispute', 'radio_NE', 'spectator', 'butt', 'bigley_NE', 'edit', 'yes',
       'angry', 'semitic_NE', 'TARGET'],
      dtype='object', name='columns', length=2027)

False

38