# Analyse MapReduce Citibike - INF438

In [None]:
# Import des bibliothèques nécessaires
import sys
import os
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd

# Display configuration: select matplotlib's built-in 'default' style sheet.
plt.style.use('default')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

## Analyse 1: Top 10 Stations de Départ

In [None]:
# Mapper 1: departure stations
# Prints the source of the streaming mapper for analysis 1. The emitted script
# reads CSV trip records on stdin and writes "<start station name><TAB>1" for
# each record. Backslashes are doubled (\\t) so that the printed script
# contains the escape sequence rather than a raw tab character.
print("""#!/usr/bin/env python3
import sys

# Index of the start-station-name column in a trip record.
START_STATION_NAME = 4

for line in sys.stdin:
    line = line.strip()
    # Skip blank lines and the CSV header (quoted or unquoted form).
    if not line or line.startswith(('"tripduration', 'tripduration')):
        continue
    # NOTE: splits on every comma, so a quoted field that itself contains a
    # comma would shift the column indices.
    fields = line.split(',')
    # Skip truncated records explicitly instead of hiding every possible
    # error behind a bare except.
    if len(fields) <= START_STATION_NAME:
        continue
    print("{}\\t1".format(fields[START_STATION_NAME]))
""")

In [None]:
# Reducer 1: top 10 departure stations
# Prints the source of the streaming reducer for analysis 1. The emitted script
# sums the per-station counts read from stdin ("<station><TAB><count>" lines)
# and writes the ten stations with the highest totals, largest first.
print("""#!/usr/bin/env python3
import sys
from collections import defaultdict

TOP_N = 10

station_counts = defaultdict(int)

for line in sys.stdin:
    # Strip only the line terminator: the key may be empty, and stripping
    # leading whitespace would swallow the tab that separates it from the count.
    # rpartition splits on the LAST tab, so a key containing a tab stays whole.
    key, sep, count = line.rstrip('\\r\\n').rpartition('\\t')
    if not sep:
        # Blank line or line without a tab: nothing to count.
        continue
    try:
        count = int(count)
    except ValueError:
        # Non-numeric count: skip the line instead of aborting the whole job.
        continue
    station_counts[key] += count

# sorted() is stable, so stations with equal totals keep first-seen order.
top_stations = sorted(station_counts.items(), key=lambda item: item[1], reverse=True)[:TOP_N]

for station, count in top_stations:
    print("{}\\t{}".format(station, count))
""")

### Résultats Analyse 1:
```
Top 10 stations:
"W 20 St & 11 Ave"      5983
"E 17 St & Broadway"    5621
"Broadway & W 58 St"    5401
"Broadway & E 14 St"    5177
"Broadway & W 24 St"    4955
"Central Park S & 6 Ave"        4925
"West Thames St"        4800
"West St & Chambers St" 4529
"Lafayette St & E 8 St" 4432
"8 Ave & W 31 St N"     4409
```

## Analyse 2: Types d'Utilisateurs

In [None]:
# Mapper 2: user type and trip duration
# Prints the source of the streaming mapper for analysis 2. The emitted script
# reads CSV trip records on stdin and writes
# "<user type><TAB><trip duration><TAB>1" for each record.
# Backslashes are doubled (\\t), as in the other cells, so that the printed
# script contains the escape sequence rather than raw tab characters inside
# its string literal.
print("""#!/usr/bin/env python3
#mapper2.py
import sys

# Column indices in a trip record.
TRIP_DURATION = 0
USER_TYPE = 12

for line in sys.stdin:
    line = line.strip()
    # Skip blank lines and the CSV header (quoted or unquoted form).
    if not line or line.startswith(('"tripduration', 'tripduration')):
        continue
    # NOTE: splits on every comma, so a quoted field that itself contains a
    # comma would shift the column indices.
    fields = line.split(',')
    # Skip truncated records explicitly instead of hiding every possible
    # error behind a bare except.
    if len(fields) <= USER_TYPE:
        continue
    print("{}\\t{}\\t1".format(fields[USER_TYPE], fields[TRIP_DURATION]))
""")


In [None]:
# Reducer 2: user-type comparison
# Prints the source of the streaming reducer for analysis 2. The emitted script
# reads "<user type><TAB><duration>[<TAB>...]" lines from stdin and writes, for
# each run of identical keys, "<user type><TAB><trip count><TAB><mean duration>"
# with the mean formatted to two decimals.
print("""#!/usr/bin/env python3
import sys


def emit(key, trips, total_duration):
    # trips is always >= 1 here: a group is only opened by a valid record.
    print("{}\\t{}\\t{:.2f}".format(key, trips, total_duration / trips))


current_key = None
total_duration = 0
count = 0

for line in sys.stdin:
    # Strip only the line terminator: stripping leading whitespace would
    # remove the tab after an empty user type and shift every column.
    parts = line.rstrip('\\r\\n').split('\\t')
    if len(parts) < 2:
        # Blank or truncated line.
        continue
    key = parts[0]
    try:
        duration = int(parts[1])
    except ValueError:
        # Non-numeric duration: skip the record instead of aborting the job.
        continue

    if key != current_key:
        # Relies on lines with the same key being contiguous on stdin:
        # a different key closes the group accumulated so far.
        if current_key is not None:
            emit(current_key, count, total_duration)
        current_key = key
        total_duration = 0
        count = 0
    total_duration += duration
    count += 1

# Flush the last group; "is not None" keeps a group whose key is empty.
if current_key is not None:
    emit(current_key, count, total_duration)
""")

### Résultats Analyse 2:
```
"Customer"      240319  1753.23
"Subscriber"    337384  1101.42
```

## Analyse 3: Analyse Horaire

In [None]:
# Mapper 3: hourly analysis
# Prints the source of the streaming mapper for analysis 3. The emitted script
# reads CSV trip records on stdin and writes "<start hour><TAB>1" for each
# record, with the hour zero-padded to two digits. Backslashes are doubled
# (\\t) so that the printed script contains the escape sequence rather than a
# raw tab character.
print("""#!/usr/bin/env python3
import sys

# Index of the start-time column in a trip record.
START_TIME = 1

for line in sys.stdin:
    line = line.strip()
    # Skip blank lines and the CSV header (quoted or unquoted form).
    if not line or line.startswith(('"tripduration', 'tripduration')):
        continue
    fields = line.split(',')
    if len(fields) <= START_TIME:
        continue
    # The start time is split on whitespace into a date part and a time part;
    # the hour is whatever precedes the first colon of the time part.
    date_and_time = fields[START_TIME].split()
    if len(date_and_time) < 2:
        continue
    hour = date_and_time[1].split(':')[0]
    # Skip records whose hour is not numeric instead of hiding every
    # possible error behind a bare except.
    if not hour.isdigit():
        continue
    # Zero-pad so that 8 and 08 are counted as the same hour and sort correctly.
    print("{}\\t1".format(hour.zfill(2)))
""")

In [None]:
# Reducer 3: hourly activity + top 5
# Prints the source of the streaming reducer for analysis 3. The emitted script
# sums the per-hour counts read from stdin ("<hour><TAB><count>" lines), writes
# every hour with its total in ascending key order, then a header line followed
# by the five hours with the highest totals.
print("""#!/usr/bin/env python3
import sys
from collections import defaultdict

hour_counts = defaultdict(int)

for line in sys.stdin:
    key, sep, count = line.rstrip('\\r\\n').rpartition('\\t')
    if not sep:
        # Blank line or line without a tab: nothing to count.
        continue
    try:
        count = int(count)
    except ValueError:
        # Non-numeric count: skip the line instead of aborting the whole job.
        continue
    hour_counts[key] += count

# Keys are strings, so this is a lexicographic sort of the hour labels.
for hour in sorted(hour_counts):
    print("{}\\t{}".format(hour, hour_counts[hour]))

print("\\n--- Top 5 heures les plus actives ---")
# sorted() is stable, so hours with equal totals keep first-seen order.
busiest = sorted(hour_counts.items(), key=lambda item: item[1], reverse=True)[:5]
for hour, count in busiest:
    print("{}h: {} trajets".format(hour, count))
""")

### Résultats Analyse 3:
```
00      7799    07      15966   14      38108   21      21731
01      4359    08      32004   15      38166   22      16924
02      2738    09      29130   16      41207   23      12099
03      1605    10      23137   17      53334
04      1171    11      27249   18      53915
05      1931    12      34827   19      43077
06      7080    13      38151   20      31995

Top 5 heures les plus actives:
18h: 53915 trajets
17h: 53334 trajets  
19h: 43077 trajets
16h: 41207 trajets
15h: 38166 trajets
```

## Interprétation Analytique

**Analyse 1 - Stations populaires:** Les stations du centre de Manhattan dominent : quatre des dix premières se trouvent sur Broadway et trois sur une avenue (6 Ave, 8 Ave, 11 Ave), ce qui indique une forte concentration d'activité dans les zones d'affaires et touristiques.

**Analyse 2 - Types d'utilisateurs:** Les Subscribers (337k trajets, durée moyenne de 1 101 s, soit environ 18 min) sont plus nombreux mais font des trajets plus courts que les Customers (240k trajets, durée moyenne de 1 753 s, soit environ 29 min). Les Subscribers sont probablement des usagers réguliers locaux.

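**Analyse 3 - Patterns horaires:** Pic d'activité en fin d'après-midi (17h-19h), correspondant aux heures de sortie du travail, avec un pic secondaire plus faible le matin à 8h (32 004 trajets). Activité minimale la nuit (3h-5h). Ce profil est typique d'un mode de transport urbain utilisé pour les trajets professionnels.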