In [7]:
from __future__ import print_function
from collections import Counter
from IPython.display import FileLink
import pandas as pd
import numpy as np

In [8]:
def parse(filename, cols, query_col='search string cleaned', count_col='searches'):
    """Read a query-log CSV and expand it to one list entry per search.

    Parameters
    ----------
    filename : str
        Path of the comma-separated, UTF-8 encoded file to read.
    cols : dict
        Maps column name -> dtype. Only these columns are loaded, so it
        must contain both ``query_col`` and ``count_col``.
    query_col : str, optional
        Name of the column holding the query text.
    count_col : str, optional
        Name of the integer column holding how often the query was searched.

    Returns
    -------
    list
        Every row's query repeated ``count`` times, in file order. Rows
        with a zero or negative count contribute nothing.

    Also prints ``"<filename>: Read <n> rows"`` where ``n`` is the number of
    CSV rows (not the number of expanded queries).
    """
    # keep_default_na=False keeps queries such as "NA" or "null" as literal
    # strings instead of turning them into NaN.
    df = pd.read_csv(filename, sep=',', encoding='utf-8',
                     usecols=list(cols), dtype=cols, keep_default_na=False)

    # Vectorised replacement for the per-row loop. Series.repeat rejects
    # negative repeat counts, whereas the old range()-based loop silently
    # skipped them, so clip at zero to keep that behaviour.
    repeats = df[count_col].clip(lower=0).to_numpy()
    queries = df[query_col].repeat(repeats).tolist()

    print('%s: Read %d rows' % (filename, len(df)))
    return queries


In [9]:
def print_stats(qr):
    """Print length statistics for a list of query strings.

    Two sections are printed: one over all queries in ``qr`` (duplicates
    included) and one over the distinct queries. Each section reports the
    number of queries followed by the mean and median of the
    whitespace-separated word count and of the character count.

    Parameters
    ----------
    qr : list of str
        The queries, one entry per search.
    """
    unique_queries = set(qr)
    sections = (
        ('- Number of queries: %d', qr),
        ('- number of distinct queries: %d', unique_queries),
    )

    # Measure everything before printing so that nothing is emitted if a
    # query turns out not to be a string.
    measured = []
    for heading, queries in sections:
        char_lengths = np.array([len(text) for text in queries])
        word_counts = np.array([len(text.split()) for text in queries])
        measured.append((heading, queries, word_counts, char_lengths))

    for heading, queries, word_counts, char_lengths in measured:
        print(heading % len(queries))
        print('  - mean number of words: %.2f' % np.mean(word_counts))
        print('  - median number of words: %.2f' % np.median(word_counts))
        print('  - mean number of chars: %.2f' % np.mean(char_lengths))
        print('  - median number of chars: %.2f' % np.median(char_lengths))


In [11]:
# Length statistics for the popular-queries exports: first the whole set,
# then the set limited to match MTMTE1.
#
# The query column is read as plain `str`. The `np.compat.unicode` name used
# previously was only a Python 2/3 compatibility alias for `str`, and the
# `np.compat` namespace is deprecated in current NumPy releases.
stats_cols = {
    'searches': np.int32,
    'search string cleaned': str,
}

for label, filename in (
        ('Whole dataset', 'popular-23-24.csv'),
        ('Selected range', 'mtmte1-popular-23-24.csv'),
):
    print(label)
    qr = parse(filename, stats_cols)
    print_stats(qr)
    print()


Whole dataset
popular-23-24.csv: Read 6367 rows
- Number of queries: 151592
  - mean number of words: 2.75
  - median number of words: 1.00
  - mean number of chars: 19.48
  - median number of chars: 12.00
- number of distinct queries: 4145
  - mean number of words: 4.08
  - median number of words: 2.00
  - mean number of chars: 29.69
  - median number of chars: 19.00

Selected range
mtmte1-popular-23-24.csv: Read 4847 rows
- Number of queries: 117449
  - mean number of words: 2.72
  - median number of words: 1.00
  - mean number of chars: 19.23
  - median number of chars: 12.00
- number of distinct queries: 3261
  - mean number of words: 4.05
  - median number of words: 2.00
  - mean number of chars: 29.47
  - median number of chars: 19.00



In [12]:
# Find the 50 most frequent queries overall in the popular queries set,
# print the ranking, export it to CSV and show a download link.
cols = {'year': int, 'month': int, 'search string cleaned': str, 'searches': int}
qr = parse('popular-23-24.csv', cols)

print()
# Counter.most_common orders by descending count and keeps first-seen order
# among equal counts, so the ranking is reproducible from run to run.
top50 = Counter(qr).most_common(50)

for rank, (query, count) in enumerate(top50, start=1):
    print('%d. %s (%d)' % (rank, query, count))

# Prepare data for export: one record per ranked query.
export_data = [
    {'Rank': rank, 'Query': query, 'Count': count}
    for rank, (query, count) in enumerate(top50, start=1)
]

# Convert to DataFrame; explicit columns keep the CSV header even when
# there are no queries at all.
export_df = pd.DataFrame(export_data, columns=['Rank', 'Query', 'Count'])

# Export to CSV. The file name lives in one variable so the written file,
# the confirmation message and the download link always agree.
export_path = 'popular-23-24-top-50.csv'
export_df.to_csv(export_path, index=False)

print()
print("Top 50 queries have been exported to '%s'." % export_path)

# Create download link
display(FileLink(export_path))


popular-23-24.csv: Read 6367 rows

1. nel (20969)
2. pubmed (12184)
3. uhsutstyr (4564)
4. web of science (4026)
5. pressreader (3279)
6. scopus (2298)
7. uptodate (1637)
8. idunn (1526)
9. medline (1472)
10. borrehaugene i vestfold (686)
11. psycinfo (658)
12. det kvalitative forskningsintervju (630)
13. aftenposten (582)
14. worldcat (562)
15. spesialpedagogikk (541)
16. loeb (526)
17. factiva (495)
18. vite være gjøre (457)
19. up to date (444)
20. kvalitativ metode (440)
21. thematic analysis (434)
22. embase (411)
23. medical education a comparative study (389)
24. oed (371)
25. oxford english dictionary (369)
26. oxford classical dictionary (367)
27. avtalerett (365)
28. sivilprosess (348)
29. 65 jan m smits the mind and method of the legal academic (346)
30. menneskets fysiologi (331)
31. loeb classical library online (325)
32. google scholar (324)
33. introduksjon til samfunnsvitenskapelig metode (310)
34. test (305)
35. the economist (302)
36. john schostak interviewing and re

In [16]:
# Find the 50 most frequent queries within the limited (MTMTE1) csv-file,
# print the ranking, export it to CSV and show a download link.
# Counter and pandas come from the notebook's import cell; they are not
# re-imported here.

cols = {'year': int, 'month': int, 'search string cleaned': str, 'searches': int}
qr = parse('mtmte1-popular-23-24.csv', cols)

# Counter.most_common orders by descending count and keeps first-seen order
# among equal counts, so the ranking is reproducible from run to run.
top50 = Counter(qr).most_common(50)

for rank, (query, count) in enumerate(top50, start=1):
    print('%d. %s (%d)' % (rank, query, count))

# Prepare data for export: one record per ranked query.
export_data = [
    {'Rank': rank, 'Query': query, 'Count': count}
    for rank, (query, count) in enumerate(top50, start=1)
]

# Convert to DataFrame; explicit columns keep the CSV header even when
# there are no queries at all.
export_df = pd.DataFrame(export_data, columns=['Rank', 'Query', 'Count'])

# Export to CSV. The file name lives in one variable so the written file,
# the confirmation message and the download link always agree (the message
# previously named a file that was never written).
export_path = 'mtmte1-popular-23-24-top-50.csv'
export_df.to_csv(export_path, index=False)

print()
print("Top 50 queries have been exported to '%s'." % export_path)

# Create download link
display(FileLink(export_path))


mtmte1-popular-23-24.csv: Read 4847 rows
1. nel (16509)
2. pubmed (10051)
3. uhsutstyr (3904)
4. web of science (3014)
5. pressreader (2515)
6. scopus (1830)
7. uptodate (1326)
8. idunn (1229)
9. medline (1188)
10. psycinfo (594)
11. det kvalitative forskningsintervju (545)
12. spesialpedagogikk (483)
13. worldcat (474)
14. aftenposten (454)
15. kvalitativ metode (408)
16. factiva (400)
17. loeb (399)
18. vite være gjøre (390)
19. up to date (369)
20. thematic analysis (362)
21. embase (354)
22. 65 jan m smits the mind and method of the legal academic (337)
23. oxford classical dictionary (332)
24. borrehaugene i vestfold (327)
25. avtalerett (301)
26. test (297)
27. menneskets fysiologi (293)
28. introduksjon til samfunnsvitenskapelig metode (289)
29. sivilprosess (287)
30. john schostak interviewing and representation in (281)
31. oed (278)
32. medical education a comparative study (275)
33. blunted opioid regulation of the hpa stress response during nicotine withdrawal therapeutic i