In [1]:
from collections import Counter
from IPython.display import FileLink
import pandas as pd
import numpy as np

In [2]:
def parse(filename, cols, query_col='search string cleaned', count_col='searches'):
    """Read a search-log CSV and expand it to one list entry per search.

    Every row of the file holds a query string and the number of times that
    query was searched; the query is repeated that many times in the result.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read (comma-separated, UTF-8).
    cols : dict
        Maps column name -> dtype. Only these columns are loaded, so the
        dict must contain both `query_col` and `count_col`.
    query_col : str, optional
        Name of the column holding the query text.
    count_col : str, optional
        Name of the column holding the integer search count of each row.

    Returns
    -------
    list
        The query strings in file order, each repeated `count_col` times.
        Rows with a count of zero or less contribute no entries.

    Side effects
    ------------
    Prints '<filename>: Read <n> rows', where n is the number of CSV rows.
    """
    # keep_default_na=False: empty cells stay '' instead of becoming NaN,
    # so every query is a real string for downstream len()/split().
    # list(cols) hands usecols a plain list of names rather than a dict view.
    df = pd.read_csv(filename, sep=',', encoding='utf-8',
                     usecols=list(cols), dtype=cols, keep_default_na=False)

    # Walk the two columns directly; iterrows() would build a Series per row
    # just to read two values from it.
    queries = [query
               for query, count in zip(df[query_col], df[count_col])
               for _ in range(count)]

    print('%s: Read %d rows' % (filename, len(df)))
    return queries


In [3]:
def print_stats(qr):
    """Print length statistics for a list of query strings.

    Two groups of figures are printed: one over all queries in `qr`
    (duplicates included) and one over the distinct queries. Each group
    reports the item count followed by the mean and median number of
    whitespace-separated words and of characters per query.

    Parameters
    ----------
    qr : iterable of str
        The queries; must support len() and be hashable item-wise.

    Returns
    -------
    None
    """
    groups = (
        ('- Number of queries: %d', qr),
        ('- number of distinct queries: %d', set(qr)),
    )

    # Measure every group before printing anything, so a bad element raises
    # before any output is produced.
    measured = []
    for header, items in groups:
        char_counts = np.array([len(query) for query in items])
        measured.append((header, len(items), char_counts))
    word_counts_per_group = [
        np.array([len(query.split()) for query in items])
        for _, items in groups
    ]

    for (header, size, char_counts), word_counts in zip(measured, word_counts_per_group):
        print(header % size)
        print('  - mean number of words: %.2f' % np.mean(word_counts))
        print('  - median number of words: %.2f' % np.median(word_counts))
        print('  - mean number of chars: %.2f' % np.mean(char_counts))
        print('  - median number of chars: %.2f' % np.median(char_counts))


In [4]:
# Zero result queries: statistics for the complete set, then for the
# dataset limited to match MTMTE1.
# Each run starts with a label line so the printed statistics can be told
# apart; the same column spec is used for both files.
QUERY_COLS = {
    'searches': np.int32,
    'search string cleaned': np.str_,
}

for label, filename in (
        ('Whole dataset', 'zero-23-24.csv'),
        ('Selected range', 'zero-23-24-mtmte1.csv'),
):
    print(label)
    qr = parse(filename, QUERY_COLS)
    print_stats(qr)
    print()

Whole dataset
zero-23-24.csv: Read 70149 rows
- Number of queries: 91057
  - mean number of words: 7.18
  - median number of words: 5.00
  - mean number of chars: 51.92
  - median number of chars: 37.00
- number of distinct queries: 65529
  - mean number of words: 7.28
  - median number of words: 5.00
  - mean number of chars: 52.31
  - median number of chars: 37.00

Selected range
zero-23-24-mtmte1.csv: Read 41042 rows
- Number of queries: 53496
  - mean number of words: 7.16
  - median number of words: 5.00
  - mean number of chars: 51.81
  - median number of chars: 37.00
- number of distinct queries: 38396
  - mean number of words: 7.28
  - median number of words: 5.00
  - mean number of chars: 52.35
  - median number of chars: 37.00



In [5]:
# Draw a random sample of rows for manual analysis

def parse_and_sample(filename, cols, sample_size, random_seed,
                     output_filename='zero-23-24-sample.csv'):
    """Sample rows from a search-log CSV, save them and show a download link.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read (comma-separated, UTF-8).
    cols : dict
        Maps column name -> dtype. Only these columns are loaded, so the
        dict must include every exported column listed in the body.
    sample_size : int
        Number of rows to draw (without replacement).
    random_seed : int
        Seed passed to `DataFrame.sample` so the draw is reproducible.
    output_filename : str, optional
        Path of the CSV file the sample is written to.

    Returns
    -------
    None
        The sample is printed, written to `output_filename`, and a
        download link to that file is displayed.

    Raises
    ------
    ValueError
        From `DataFrame.sample` if `sample_size` exceeds the number of rows.
    KeyError
        If one of the exported columns was not loaded via `cols`.
    """
    # Imported explicitly so `display` does not depend on the notebook
    # having injected it into builtins.
    from IPython.display import FileLink, display

    # Read the CSV file; keep_default_na=False leaves empty cells as ''.
    df = pd.read_csv(filename, sep=',', encoding='utf-8',
                     usecols=list(cols), dtype=cols, keep_default_na=False)

    # Randomly draw `sample_size` rows; the seed makes the draw repeatable.
    sampled_df = df.sample(n=sample_size, random_state=random_seed)

    # Columns to export, in the order they should appear in the output file.
    specified_columns = ['date', 'searches', 'search string', 'search string cleaned',
                         'field searched', 'search type', 'active tab', 'resource type pre-filter',
                         'signed in', 'on campus']
    sampled_df = sampled_df[specified_columns]

    # Export the sampled rows (without the original row index).
    sampled_df.to_csv(output_filename, index=False)

    # Show the sample in the cell output.
    print(sampled_df)

    # Offer the file that was just written as a download link.
    display(FileLink(output_filename))


In [6]:
# Run the sampling
cols = {
    'year': int,
    'month': int,
    'date': str,
    'search string': str,
    'search string cleaned': str,
    'field searched': str,
    'search type': str,
    'active tab': str,
    'resource type pre-filter': str,
    'signed in': str,
    'on campus': str,
    'searches': int,
}

# Arguments: file name, the column spec above, number of rows, random seed
parse_and_sample('zero-23-24.csv', cols, sample_size=50, random_seed=4)



             date  searches  \
10985  2023-04-25         1   
67043  2024-11-28         1   
3732   2023-02-08         2   
11869  2023-05-04         2   
50392  2024-06-08         1   
2606   2023-01-28         1   
69832  2024-12-28         3   
68249  2024-12-11         1   
20117  2023-07-29         1   
395    2023-01-05         2   
33611  2023-12-17         1   
52052  2024-06-25         1   
25167  2023-09-21         1   
19913  2023-07-27         1   
8605   2023-03-31         2   
12268  2023-05-08         1   
27426  2023-10-14         1   
50292  2024-06-07         1   
44818  2024-04-12         1   
41561  2024-03-09         1   
25748  2023-09-27         1   
68282  2024-12-11         1   
19901  2023-07-27         1   
51964  2024-06-24         1   
27951  2023-10-19         1   
37180  2024-01-24         1   
8446   2023-03-30         2   
22027  2023-08-19         1   
15646  2023-06-13         2   
22184  2023-08-21         2   
40087  2024-02-23         1   
2228   2